| 1 | """ | 1 | """ |
|---|
| 2 | Concordance (and statistics) for texts in database. | 2 | Concordance (and statistics) for texts in database. |
|---|
| 3 | | 3 | |
|---|
| 4 | To build concordance use ConcordanceBuilder. To access concordance/statistics | 4 | To build concordance use ConcordanceBuilder. To access concordance/statistics |
|---|
| 5 | use Concordance/Statistics class. Concordance and statistics are provided as | 5 | use Concordance/Statistics class. Concordance and statistics are provided as |
|---|
| 6 | dictionaries keyed by words. | 6 | dictionaries keyed by words. |
|---|
| 7 | | 7 | |
|---|
| 8 | NB: all word keys have been lower-cased in order to render them | 8 | NB: all word keys have been lower-cased in order to render them |
|---|
| 9 | case-insensitive | 9 | case-insensitive |
|---|
| 10 | """ | 10 | """ |
|---|
| 11 | import re | 11 | import re |
|---|
| 12 | | 12 | |
|---|
| 13 | import sqlobject | 13 | import sqlobject |
|---|
| 14 | | 14 | |
|---|
| 15 | import shakespeare.index | 15 | import shakespeare.index |
|---|
| 16 | import shakespeare.cache | 16 | import shakespeare.cache |
|---|
| 17 | | 17 | |
|---|
| 18 | | 18 | |
|---|
class ConcordanceBase(object):
    """Shared query plumbing for the concordance and statistics views.

    Keeps references to the SQLObject tables used for lookups and builds,
    at construction time, one filter expression per table so that results
    can be restricted to a chosen set of texts.

    TODO: caching??
    """
    sqlcc = shakespeare.dm.Concordance
    sqlstat = shakespeare.dm.Statistic

    def __init__(self, filter_names=None):
        """
        @param filter_names: a list of id names with which to filter results
            (i.e. only return results relating to those texts). When None
            (the default) no restriction is applied.
        """
        self._filter_names = filter_names
        self.sqlcc_filter = self._make_filter(self.sqlcc)
        self.sqlstat_filter = self._make_filter(self.sqlstat)

    def _make_filter(self, sqlobj):
        """Build the text-restriction clause for the table `sqlobj`.

        @param sqlobj: table class whose `q.textID` column is compared
        @return: True when no filter names were given, otherwise an OR over
            one `textID == id` comparison per filter name
        """
        if self._filter_names is None:
            return True
        clauses = [
            sqlobj.q.textID == self._name2id(text_name)
            for text_name in self._filter_names
        ]
        return sqlobject.OR(*clauses)

    def _name2id(self, name):
        """Return the id of the Material record looked up by `name`."""
        material = shakespeare.dm.Material.byName(name)
        return material.id

    def keys(self):
        """Return sorted list of *distinct* words in concordance/statistics
        """
        rows = self.sqlstat.select(
            self.sqlstat_filter,
            orderBy=self.sqlstat.q.word,
        )
        # The same word can appear in several rows; collapse duplicates
        # before sorting.
        return sorted(set(row.word for row in rows))
|---|
| 58 | | 58 | |
|---|
| 59 | | 59 | |
|---|
class Concordance(ConcordanceBase):
    """Word-keyed concordance over a set of texts.
    """

    def get(self, word):
        """Look up the occurrences of `word`.

        @param word: word to match against the concordance `word` column
        @return: sqlobject query list of concordance rows for the word,
            combined with the text filter built at construction
        """
        word_matches = self.sqlcc.q.word == word
        condition = sqlobject.AND(self.sqlcc_filter, word_matches)
        return self.sqlcc.select(condition)
|---|
| 70 | | 70 | |
|---|
class Statistics(ConcordanceBase):
    """Word-keyed occurrence totals over a set of texts.
    """

    def get(self, word):
        """Return the total occurrence count recorded for `word`.

        @param word: word to match against the statistics `word` column
        @return: sum of `occurrences` over the matching statistic rows
            (0 when no row matches)
        """
        word_matches = self.sqlstat.q.word == word
        condition = sqlobject.AND(self.sqlstat_filter, word_matches)
        matching = self.sqlstat.select(condition)
        return sum(row.occurrences for row in matching)
|---|
| 81 | | 81 | |
|---|
| 82 | class ConcordanceBuilder(object): | 82 | class ConcordanceBuilder(object): |
|---|
| 83 | """Build a concordance and associated statistics for a set of texts. | 83 | """Build a concordance and associated statistics for a set of texts. |
|---|
| 84 | | 84 | |
|---|
| 85 | """ | 85 | """ |
|---|
| 86 | | 86 | |
|---|
| 87 | # multiline, unicode and ignorecase | 87 | # multiline, unicode and ignorecase |
|---|
| 88 | word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) | 88 | word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) |
|---|
| 89 | | 89 | |
|---|
| 90 | words_to_ignore = [ | 90 | words_to_ignore = [ |
|---|
| 93 | | 116 | |
|---|
def _text_already_done(self, text):
    """Report whether concordance rows already exist for `text`.

    @param text: object whose `id` is compared with the rows' textID
    @return: True if at least one Concordance row references the text
    """
    concordance = shakespeare.dm.Concordance
    existing = concordance.select(concordance.q.textID == text.id)
    return existing.count() > 0
|---|
| 99 | | 122 | |
|---|
| 100 | def add_text(self, name, text=None): | 123 | def add_text(self, name, text=None): |
|---|
| 101 | """Add a text to the concordance. | 124 | """Add a text to the concordance. |
|---|
| 102 | @param name: name of text to add | 125 | @param name: name of text to add |
|---|
| 103 | @param text: [optional] a file-like object containing text data. If not | 126 | @param text: [optional] a file-like object containing text data. If not |
|---|
| 104 | provided will default to using file in cache associated with named | 127 | provided will default to using file in cache associated with named |
|---|
| 105 | text | 128 | text |
|---|
| 106 | """ | 129 | """ |
|---|
| 107 | dmText = shakespeare.dm.Material.byName(name) | 130 | dmText = shakespeare.dm.Material.byName(name) |
|---|
| 108 | if self._text_already_done(dmText): | 131 | if self._text_already_done(dmText): |
|---|
| 109 | msg = 'Have already added to concordance text: %s' % dmText | 132 | msg = 'Have already added to concordance text: %s' % dmText |
|---|
| 110 | # raise ValueError(msg) | 133 | # raise ValueError(msg) |
|---|
| 111 | print msg | 134 | print msg |
|---|
| 112 | print 'Skipping' | 135 | print 'Skipping' |
|---|
| 113 | return | 136 | return |
|---|
| 114 | if text is None: | 137 | if text is None: |
|---|
| 115 | tpath = dmText.get_cache_path('plain') | 138 | tpath = dmText.get_cache_path('plain') |
|---|
| 116 | text = file(tpath) | 139 | text = file(tpath) |
|---|
| 117 | lineCount = 0 | 140 | lineCount = 0 |
|---|
| 118 | charIndex = 0 | 141 | charIndex = 0 |
|---|
| 119 | stats = {} | 142 | stats = {} |
|---|
| 120 | trans = shakespeare.dm.Concordance._connection.transaction() | 143 | trans = shakespeare.dm.Concordance._connection.transaction() |
|---|
| 121 | for line in text.readlines(): | 144 | for line in text.readlines(): |
|---|
| 122 | for match in self.word_regex.finditer(line): | 145 | for match in self.word_regex.finditer(line): |
|---|
| 123 | word = match.group().lower() # case insensitive | 146 | word = match.group().lower() # case insensitive |
|---|
| 125 | continue | 148 | continue |
|---|
| 126 | shakespeare.dm.Concordance(connection=trans, | 149 | shakespeare.dm.Concordance(connection=trans, |
|---|
| 127 | text=dmText, | 150 | text=dmText, |
|---|
| 128 | word=word, | 151 | word=word, |
|---|
| 129 | line=lineCount, | 152 | line=lineCount, |
|---|
| 130 | char_index=charIndex+match.start()) | 153 | char_index=charIndex+match.start()) |
|---|
| 131 | stats[word] = stats.get(word, 0) + 1 | 154 | stats[word] = stats.get(word, 0) + 1 |
|---|
| 132 | lineCount += 1 | 155 | lineCount += 1 |
|---|
| 133 | charIndex += len(line) | 156 | charIndex += len(line) |
|---|
| 134 | trans.commit() | 157 | trans.commit() |
|---|
| 135 | trans = shakespeare.dm.Concordance._connection.transaction() | 158 | trans = shakespeare.dm.Concordance._connection.transaction() |
|---|
| 136 | for word, value in stats.items(): | 159 | for word, value in stats.items(): |
|---|
| 137 | tresults = shakespeare.dm.Statistic.select( | 160 | tresults = shakespeare.dm.Statistic.select( |
|---|
| 138 | sqlobject.AND( | 161 | sqlobject.AND( |
|---|
| 139 | shakespeare.dm.Statistic.q.textID == dmText.id, | 162 | shakespeare.dm.Statistic.q.textID == dmText.id, |
|---|
| 140 | shakespeare.dm.Statistic.q.word == word | 163 | shakespeare.dm.Statistic.q.word == word |
|---|
| 141 | )) | 164 | )) |
|---|
| 142 | try: | 165 | try: |
|---|
| 143 | dbstat = list(tresults)[0] | 166 | dbstat = list(tresults)[0] |
|---|
| 144 | dbstat.occurrences += value | 167 | dbstat.occurrences += value |
|---|
| 145 | except: | 168 | except: |
|---|
| 146 | shakespeare.dm.Statistic( | 169 | shakespeare.dm.Statistic( |
|---|
| 147 | connection=trans, | 170 | connection=trans, |
|---|
| 148 | text=dmText, | 171 | text=dmText, |
|---|
| 149 | word=word, | 172 | word=word, |
|---|
| 150 | occurrences=value | 173 | occurrences=value |
|---|
| 151 | ) | 174 | ) |
|---|
| 152 | trans.commit() | 175 | trans.commit() |
|---|
| 153 | | 176 | |
|---|
| 154 | | 177 | |
|---|
def remove_text(self, name):
    """Remove a text from the concordance.

    Deletes, one at a time, every Concordance row whose textID matches
    the Material record found for the name.

    @param name: as for add_text
    """
    concordance = shakespeare.dm.Concordance
    material = shakespeare.dm.Material.byName(name)
    matching = concordance.select(concordance.q.textID == material.id)
    for row in matching:
        concordance.delete(row.id)
|---|