Changeset 72
- Timestamp: 12/27/06 18:09:08 (2 years ago)
- Files:
  - trunk/src/shakespeare/concordance.py (modified) (1 diff)
  - trunk/src/shakespeare/concordance_test.py (modified) (1 diff)
  - trunk/src/shakespeare/dm.py (modified) (1 diff)
  - trunk/src/shakespeare/dm_test.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/src/shakespeare/concordance.py
Revision 51 Revision 72 1 """ 1 """ 2 Concordance (and statistics) for texts in database. 2 Concordance (and statistics) for texts in database. 3 3 4 To build concordance use ConcordanceBuilder. To access concordance/statistics 4 To build concordance use ConcordanceBuilder. To access concordance/statistics 5 use Concordance/Statistics class. Concordance and statistics are provided as 5 use Concordance/Statistics class. Concordance and statistics are provided as 6 dictionaries keyed by words. 6 dictionaries keyed by words. 7 7 8 NB: all word keys have been lower-cased in order to render them 8 NB: all word keys have been lower-cased in order to render them 9 case-insensitive 9 case-insensitive 10 """ 10 """ 11 import re 11 import re 12 12 13 import sqlobject 13 import sqlobject 14 14 15 import shakespeare.index 15 import shakespeare.index 16 import shakespeare.cache 16 import shakespeare.cache 17 17 18 18 19 class ConcordanceBase(object): 19 class ConcordanceBase(object): 20 """ 20 """ 21 TODO: caching?? 21 TODO: caching?? 22 """ 22 """ 23 sqlcc = shakespeare.dm.Concordance 23 sqlcc = shakespeare.dm.Concordance 24 sqlstat = shakespeare.dm.Statistic 24 25 25 def __init__(self, filter_names=None): 26 def __init__(self, filter_names=None): 26 """ 27 """ 27 @param filter_names: a list of id names with which to filter results 28 @param filter_names: a list of id names with which to filter results 28 (i.e. only return results relating to those texts) 29 (i.e. 
only return results relating to those texts) 29 """ 30 """ 30 self._filter_names = filter_names 31 self._filter_names = filter_names 31 # piece of sql to use in select to filter texts 32 self.sqlcc_filter = self._make_filter(self.sqlcc) 32 self._sql_filter = True 33 self.sqlstat_filter = self._make_filter(self.sqlstat) 34 35 def _make_filter(self, sqlobj): 36 sql_filter = True 33 if self._filter_names is not None: 37 if self._filter_names is not None: 34 arglist = [] 38 arglist = [] 35 for name in self._filter_names: 39 for name in self._filter_names: 36 newarg = self.sqlcc.q.textID == self._name2id(name) 40 newarg = sqlobj.q.textID == self._name2id(name) 37 arglist.append(newarg) 41 arglist.append(newarg) 38 self._sql_filter = sqlobject.OR(*arglist) 42 sql_filter = sqlobject.OR(*arglist) 43 return sql_filter 39 44 40 def _name2id(self, name): 45 def _name2id(self, name): 41 return shakespeare.dm.Material.byName(name).id 46 return shakespeare.dm.Material.byName(name).id 42 47 43 def keys(self): 48 def keys(self): 44 """Return list of words in concordance 49 """Return list of *distinct* words in concordance/statistics 45 """ 50 """ 46 # distinct does not help us because we need to DISTINCT word 51 all = self.sqlstat.select(self.sqlstat_filter, 47 # but can't do this with sqlobject 52 orderBy=self.sqlstat.q.word, 48 all = self.sqlcc.select(self._sql_filter, 53 ) 49 orderBy=self.sqlcc.q.word, 50 distinct=True) 51 words = [ xx.word for xx in list(all) ] 54 words = [ xx.word for xx in list(all) ] 52 distinct = list(set(words)) 55 distinct = list(set(words)) 53 distinct.sort() 56 distinct.sort() 54 return distinct 57 return distinct 55 58 56 59 57 class Concordance(ConcordanceBase): 60 class Concordance(ConcordanceBase): 58 """Concordance by word for a set of texts 61 """Concordance by word for a set of texts 59 """ 62 """ 60 63 61 def get(self, word): 64 def get(self, word): 62 """Get list of occurrences for word 65 """Get list of occurrences for word 63 @return: 
sqlobject query list 66 @return: sqlobject query list 64 """ 67 """ 65 select = self.sqlcc.select(sqlobject.AND(self._sql_filter, self.sqlcc.q.word==word)) 68 select = self.sqlcc.select(sqlobject.AND(self.sqlcc_filter, self.sqlcc.q.word==word)) 66 return select 69 return select 67 70 68 class Statistics(ConcordanceBase): 71 class Statistics(ConcordanceBase): 69 72 70 def get(self, word): 73 def get(self, word): 71 select = self.sqlcc.select( 74 select = self.sqlstat.select( 72 sqlobject.AND(self._sql_filter, self.sqlcc.q.word==word) 75 sqlobject.AND(self.sqlstat_filter, self.sqlstat.q.word==word) 73 ) 76 ) 74 return select.count() 77 total = 0 78 for stat in select: 79 total += stat.occurrences 80 return total 75 81 76 class ConcordanceBuilder(object): 82 class ConcordanceBuilder(object): 77 """Build a concordance and associated statistics for a set of texts. 83 """Build a concordance and associated statistics for a set of texts. 78 84 79 """ 85 """ 80 86 81 # multiline, unicode and ignorecase 87 # multiline, unicode and ignorecase 82 word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) 88 word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) 83 89 84 words_to_ignore = [ 90 words_to_ignore = [ 85 # 'a', 'the', 'and', 'as', 'are', 'be', 'but', 'd', 'in' 91 # 'a', 'the', 'and', 'as', 'are', 'be', 'but', 'd', 'in' 86 ] 92 ] 87 93 88 def _text_already_done(self, text): 94 def _text_already_done(self, text): 89 numrecs = shakespeare.dm.Concordance.select( 95 numrecs = shakespeare.dm.Concordance.select( 90 shakespeare.dm.Concordance.q.textID==text.id 96 shakespeare.dm.Concordance.q.textID==text.id 91 ).count() 97 ).count() 92 return numrecs > 0 98 return numrecs > 0 93 99 94 def add_text(self, name, text=None): 100 def add_text(self, name, text=None): 95 """Add a text to the concordance. 101 """Add a text to the concordance. 
96 @param name: name of text to add 102 @param name: name of text to add 97 @param text: [optional] a file-like object containing text data. If not 103 @param text: [optional] a file-like object containing text data. If not 98 provided will default to using file in cache associated with named 104 provided will default to using file in cache associated with named 99 text 105 text 100 """ 106 """ 101 dmText = shakespeare.dm.Material.byName(name) 107 dmText = shakespeare.dm.Material.byName(name) 102 if self._text_already_done(dmText): 108 if self._text_already_done(dmText): 103 msg = 'Have already added to concordance text: %s' % dmText 109 msg = 'Have already added to concordance text: %s' % dmText 104 # raise ValueError(msg) 110 # raise ValueError(msg) 105 print msg 111 print msg 106 print 'Skipping' 112 print 'Skipping' 107 return 113 return 108 if text is None: 114 if text is None: 109 tpath = dmText.get_cache_path('plain') 115 tpath = dmText.get_cache_path('plain') 110 text = file(tpath) 116 text = file(tpath) 111 lineCount = 0 117 lineCount = 0 112 charIndex = 0 118 charIndex = 0 119 stats = {} 113 trans = shakespeare.dm.Concordance._connection.transaction() 120 trans = shakespeare.dm.Concordance._connection.transaction() 114 for line in text.readlines(): 121 for line in text.readlines(): 115 for match in self.word_regex.finditer(line): 122 for match in self.word_regex.finditer(line): 116 word = match.group().lower() # case insensitive 123 word = match.group().lower() # case insensitive 117 if word in self.words_to_ignore: 124 if word in self.words_to_ignore: 118 continue 125 continue 119 shakespeare.dm.Concordance(connection=trans, 126 shakespeare.dm.Concordance(connection=trans, 120 text=dmText, 127 text=dmText, 121 word=word, 128 word=word, 122 line=lineCount, 129 line=lineCount, 123 char_index=charIndex+match.start()) 130 char_index=charIndex+match.start()) 131 stats[word] = stats.get(word, 0) + 1 124 lineCount += 1 132 lineCount += 1 125 charIndex += 
len(line) 133 charIndex += len(line) 126 trans.commit() 134 trans.commit() 135 trans = shakespeare.dm.Concordance._connection.transaction() 136 for word, value in stats.items(): 137 tresults = shakespeare.dm.Statistic.select( 138 sqlobject.AND( 139 shakespeare.dm.Statistic.q.textID == dmText.id, 140 shakespeare.dm.Statistic.q.word == word 141 )) 142 try: 143 dbstat = list(tresults)[0] 144 dbstat.occurrences += value 145 except: 146 shakespeare.dm.Statistic( 147 connection=trans, 148 text=dmText, 149 word=word, 150 occurrences=value 151 ) 152 trans.commit() 153 127 154 128 def remove_text(self, name): 155 def remove_text(self, name): 129 """Remove a text from the concordance. 156 """Remove a text from the concordance. 130 157 131 @param name: as for add_text 158 @param name: as for add_text 132 """ 159 """ 133 dmText = shakespeare.dm.Material.byName(name) 160 dmText = shakespeare.dm.Material.byName(name) 134 recs = shakespeare.dm.Concordance.select( 161 recs = shakespeare.dm.Concordance.select( 135 shakespeare.dm.Concordance.q.textID==dmText.id 162 shakespeare.dm.Concordance.q.textID==dmText.id 136 ) 163 ) 137 for rec in recs: 164 for rec in recs: 138 shakespeare.dm.Concordance.delete(rec.id) 165 shakespeare.dm.Concordance.delete(rec.id) 139 166 trunk/src/shakespeare/concordance_test.py
Revision 40 Revision 72 1 import unittest 1 import unittest 2 import StringIO 2 import StringIO 3 import tempfile 3 import tempfile 4 4 5 5 6 import shakespeare.index 6 import shakespeare.index 7 import shakespeare.concordance 7 import shakespeare.concordance 8 8 9 class TestConcordancer: 9 class TestConcordancer: 10 10 11 inText = \ 11 inText = \ 12 """A fake fake line 12 """A fake fake line 13 SUFFOLK. 13 SUFFOLK. 14 As by your high imperial Majesty 14 As by your high imperial Majesty 15 I had in charge at my depart for France, 15 I had in charge at my depart for France, 16 As procurator to your excellence, 16 As procurator to your excellence, 17 A fake imperial line. 17 """ 18 """ 18 name = 'test-concordance' 19 name = 'test-concordance' 19 title = 'Hamlet' 20 title = 'Hamlet' 20 21 21 # ['work_id', 'line-no', 'character-index'] } 22 # ['work_id', 'line-no', 'character-index'] } 23 # incomplete 22 expConcordance = { 24 expConcordance = { 23 'fake' : [ (name, 0, 2), (name, 0, 7) ], 25 'fake' : [ (name, 0, 2), (name, 0, 7), (name, 5, 136) ], 24 'suffolk' : [ (name, 1, 17), ], 26 'suffolk' : [ (name, 1, 17), ], 25 'high' : [ (name, 2, 37), ], 27 'high' : [ (name, 2, 37), ], 26 'word_that_is_not_there' : [], 28 'word_that_is_not_there' : [], 27 } 29 } 28 30 31 # incomplete 29 expStats = { 32 expStats = { 30 'fake' : 2, 33 'fake' : 3, 34 'imperial' : 2, 31 'suffolk' : 1, 35 'suffolk' : 1, 32 'high' : 1, 36 'high' : 1, 33 'word_that_is_not_there' : 0, 37 'word_that_is_not_there' : 0, 34 } 38 } 35 39 36 def setup_class(cls): 40 def setup_class(cls): 37 cls.builder = shakespeare.concordance.ConcordanceBuilder() 41 cls.builder = shakespeare.concordance.ConcordanceBuilder() 38 # try deleting it first so as to be more robust to errors 42 # try deleting it first so as to be more robust to errors 39 # does not seem to work with the class methods 43 # does not seem to work with the class methods 40 # cls.teardown_class(cls) 44 # cls.teardown_class(cls) 41 cls.text = 
shakespeare.dm.Material(name=cls.name, title=cls.title) 45 cls.text = shakespeare.dm.Material(name=cls.name, title=cls.title) 42 cls.builder.add_text(cls.name, StringIO.StringIO(cls.inText)) 46 cls.builder.add_text(cls.name, StringIO.StringIO(cls.inText)) 43 cls.concordance = shakespeare.concordance.Concordance([cls.name]) 47 cls.concordance = shakespeare.concordance.Concordance([cls.name]) 44 cls.statistics = shakespeare.concordance.Statistics([cls.name]) 48 cls.statistics = shakespeare.concordance.Statistics([cls.name]) 45 49 46 def teardown_class(cls): 50 def teardown_class(cls): 47 # allow us to deal with left over stuff from previous errors 51 # allow us to deal with left over stuff from previous errors 48 try: 52 try: 49 cls.builder.remove_text(cls.name) 53 cls.builder.remove_text(cls.name) 50 tmp = shakespeare.dm.Material.byName(cls.name) 54 tmp = shakespeare.dm.Material.byName(cls.name) 51 shakespeare.dm.Material.delete(tmp.id) 55 shakespeare.dm.Material.delete(tmp.id) 52 except: 56 except: 53 pass 57 pass 54 58 55 def test__process_line(self): 59 def test__process_line(self): 56 line = 'the - quick, brown. fox-jumped over$ the_lazy do8g.' 60 line = 'the - quick, brown. fox-jumped over$ the_lazy do8g.' 
57 exp = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the_lazy', 'do8g' ] 61 exp = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the_lazy', 'do8g' ] 58 out = self.builder.word_regex.findall(line) 62 out = self.builder.word_regex.findall(line) 59 assert exp == out 63 assert exp == out 60 64 61 def test_concordance(self): 65 def test_concordance(self): 62 for key, value in self.expConcordance.items(): 66 for key, value in self.expConcordance.items(): 63 listing = list(self.concordance.get(key)) 67 listing = list(self.concordance.get(key)) 64 listing.reverse() 68 assert len(listing) == len(value) 65 out = [ (xx.text.name, xx.line, xx.char_index) for xx in listing ] 69 for xx in listing: 66 assert out == value 70 assert (xx.text.name, xx.line, xx.char_index) in value 67 71 68 def test_stats(self): 72 def test_stats(self): 69 for key, value in self.expStats.items(): 73 for key, value in self.expStats.items(): 70 out = self.statistics.get(key) 74 out = self.statistics.get(key) 75 print key 71 assert out == value 76 assert out == value 72 77 73 def test_keys(self): 78 def test_keys(self): 74 words = self.concordance.keys() 79 words = self.concordance.keys() 75 assert 'a' == words[0] 80 assert 'a' == words[0] 76 assert 'your' == words[-1] 81 assert 'your' == words[-1] 77 assert 22 == len(words) 82 assert 22 == len(words) trunk/src/shakespeare/dm.py
Revision 51 Revision 72 1 """ 1 """ 2 Domain model 2 Domain model 3 3 4 Material contains all data we have including shakespeare texts. A text is taken 4 Material contains all data we have including shakespeare texts. A text is taken 5 to be a specific version of a work. e.g. the 1623 folio of King Richard III. 5 to be a specific version of a work. e.g. the 1623 folio of King Richard III. 6 6 7 We may in future add a Work object to refer to 'abstract' work of which a given 7 We may in future add a Work object to refer to 'abstract' work of which a given 8 text is a version. 8 text is a version. 9 """ 9 """ 10 import sqlobject 10 import sqlobject 11 11 12 import shakespeare 12 import shakespeare 13 import shakespeare.cache 13 import shakespeare.cache 14 14 15 uri = shakespeare.conf().get('db', 'uri') 15 uri = shakespeare.conf().get('db', 'uri') 16 __connection__ = sqlobject.connectionForURI(uri) 16 __connection__ = sqlobject.connectionForURI(uri) 17 17 18 # note we run this at bottom of module to auto create db tables on import 18 # note we run this at bottom of module to auto create db tables on import 19 def createdb(): 19 def createdb(): 20 Material.createTable(ifNotExists=True) 20 Material.createTable(ifNotExists=True) 21 Concordance.createTable(ifNotExists=True) 21 Concordance.createTable(ifNotExists=True) 22 Statistic.createTable(ifNotExists=True) 22 23 23 def cleandb(): 24 def cleandb(): 25 Statistic.dropTable(ifExists=True) 24 Concordance.dropTable(ifExists=True) 26 Concordance.dropTable(ifExists=True) 25 Material.dropTable(ifExists=True) 27 Material.dropTable(ifExists=True) 26 28 27 def rebuilddb(): 29 def rebuilddb(): 28 cleandb() 30 cleandb() 29 createdb() 31 createdb() 30 32 31 class Material(sqlobject.SQLObject): 33 class Material(sqlobject.SQLObject): 32 """Material related to Shakespeare (usually text of works and ancillary 34 """Material related to Shakespeare (usually text of works and ancillary 33 matter such as introductions). 
35 matter such as introductions). 34 36 35 NB: can not use 'text' as class name as it is an sql reserved word 37 NB: can not use 'text' as class name as it is an sql reserved word 36 38 37 @attribute name: a unique name identifying the material 39 @attribute name: a unique name identifying the material 38 40 39 TODO: mutiple creators ?? 41 TODO: mutiple creators ?? 40 """ 42 """ 41 43 42 name = sqlobject.StringCol(alternateID=True) 44 name = sqlobject.StringCol(alternateID=True) 43 title = sqlobject.StringCol(default=None, length=255) 45 title = sqlobject.StringCol(default=None, length=255) 44 # creator rather than author to fit with dublin core 46 # creator rather than author to fit with dublin core 45 creator = sqlobject.StringCol(default=None, length=255) 47 creator = sqlobject.StringCol(default=None, length=255) 46 url = sqlobject.StringCol(default=None, length=255) 48 url = sqlobject.StringCol(default=None, length=255) 47 notes = sqlobject.StringCol(default=None) 49 notes = sqlobject.StringCol(default=None) 48 50 49 def get_cache_path(self, format): 51 def get_cache_path(self, format): 50 """Get path within cache to data file associated with this material. 52 """Get path within cache to data file associated with this material. 
51 @format: the version ('plain', original='' etc) 53 @format: the version ('plain', original='' etc) 52 """ 54 """ 53 return shakespeare.cache.default.path(self.url, format) 55 return shakespeare.cache.default.path(self.url, format) 54 56 55 class Concordance(sqlobject.SQLObject): 57 class Concordance(sqlobject.SQLObject): 56 58 57 text = sqlobject.ForeignKey('Material') 59 text = sqlobject.ForeignKey('Material') 58 word = sqlobject.StringCol(length=50) 60 word = sqlobject.StringCol(length=50) 59 line = sqlobject.IntCol() 61 line = sqlobject.IntCol() 60 char_index = sqlobject.IntCol() 62 char_index = sqlobject.IntCol() 61 63 62 word_index = sqlobject.DatabaseIndex('word') 64 word_index = sqlobject.DatabaseIndex('word') 63 text_index = sqlobject.DatabaseIndex('text') 65 text_index = sqlobject.DatabaseIndex('text') 64 66 67 class Statistic(sqlobject.SQLObject): 68 69 text = sqlobject.ForeignKey('Material') 70 word = sqlobject.StringCol(length=50) 71 occurrences = sqlobject.IntCol(default=1) 72 73 word_index = sqlobject.DatabaseIndex('word') 74 text_index = sqlobject.DatabaseIndex('text') 75 65 76 66 # auto create db tables on import 77 # auto create db tables on import 67 createdb() 78 createdb() 68 79 trunk/src/shakespeare/dm_test.py
Revision 51 Revision 72 1 import sqlobject 2 1 import shakespeare.dm 3 import shakespeare.dm 2 4 3 class TestMaterial: 5 class TestMaterial: 4 6 5 def setup_class(self): 7 def setup_class(self): 6 self.name = 'test-123' 8 self.name = 'test-123' 7 self.title = 'Hamlet' 9 self.title = 'Hamlet' 8 self.url = 'http://www.openshakespeare.org/blah.txt' 10 self.url = 'http://www.openshakespeare.org/blah.txt' 9 self.text = shakespeare.dm.Material(name=self.name, 11 self.text = shakespeare.dm.Material(name=self.name, 10 title=self.title, url=self.url) 12 title=self.title, url=self.url) 11 13 12 def teardown_class(self): 14 def teardown_class(self): 13 shakespeare.dm.Material.delete(self.text.id) 15 shakespeare.dm.Material.delete(self.text.id) 14 16 15 def test1(self): 17 def test1(self): 16 txtid = self.text.id 18 txtid = self.text.id 17 txt2 = shakespeare.dm.Material.get(txtid) 19 txt2 = shakespeare.dm.Material.get(txtid) 18 txt3 = shakespeare.dm.Material.byName(self.name) 20 txt3 = shakespeare.dm.Material.byName(self.name) 19 assert self.text.id == txt2.id 21 assert self.text.id == txt2.id 20 assert self.text.id == txt3.id 22 assert self.text.id == txt3.id 21 23 22 def test_get_cache_path(self): 24 def test_get_cache_path(self): 23 out = self.text.get_cache_path('plain') 25 out = self.text.get_cache_path('plain') 24 # do not want anything too specific or we end up duplicating cache_test 26 # do not want anything too specific or we end up duplicating cache_test 25 assert len(out) > 0 27 assert len(out) > 0 26 28 27 class TestConcordance: 29 class TestConcordance: 28 30 29 def setup_class(self): 31 def setup_class(self): 30 self.name = 'test-123' 32 self.name = 'test-123' 31 self.title = 'Hamlet' 33 self.title = 'Hamlet' 32 self.text = shakespeare.dm.Material(name=self.name, title=self.title) 34 self.text = shakespeare.dm.Material(name=self.name, title=self.title) 33 word = 'jones' 35 word = 'jones' 34 line = 20 36 line = 20 35 char_index = 500 37 char_index = 500 36 
self.cc1 = shakespeare.dm.Concordance(text=self.text, 38 self.cc1 = shakespeare.dm.Concordance(text=self.text, 37 word=word, 39 word=word, 38 line=line, 40 line=line, 39 char_index=char_index) 41 char_index=char_index) 40 42 41 def teardown_class(self): 43 def teardown_class(self): 42 shakespeare.dm.Concordance.delete(self.cc1.id) 44 shakespeare.dm.Concordance.delete(self.cc1.id) 43 shakespeare.dm.Material.delete(self.text.id) 45 shakespeare.dm.Material.delete(self.text.id) 44 46 45 def test1(self): 47 def test1(self): 46 out1 = shakespeare.dm.Concordance.get(self.cc1.id) 48 out1 = shakespeare.dm.Concordance.get(self.cc1.id) 47 assert self.text == out1.text 49 assert self.text == out1.text 48 50 51 class TestStatistic: 52 53 def setup_class(self): 54 self.name = 'test-123' 55 self.title = 'Hamlet' 56 self.text = shakespeare.dm.Material(name=self.name, title=self.title) 57 self.word = 'jones' 58 self.occurrences = 5 59 self.cc1 = shakespeare.dm.Statistic( 60 text=self.text, 61 word=self.word, 62 occurrences=self.occurrences 63 ) 64 65 def teardown_class(self): 66 shakespeare.dm.Statistic.delete(self.cc1.id) 67 shakespeare.dm.Material.delete(self.text.id) 68 69 def test1(self): 70 out1 = shakespeare.dm.Statistic.get(self.cc1.id) 71 assert self.text == out1.text 72 assert out1.occurrences == self.occurrences 73 74 def test_select(self): 75 tresults = shakespeare.dm.Statistic.select( 76 sqlobject.AND( 77 shakespeare.dm.Statistic.q.textID == self.text.id, 78 shakespeare.dm.Statistic.q.word == self.word, 79 )) 80 num = tresults.count() 81 assert num == 1 82
