Changeset 72

Show
Ignore:
Timestamp:
12/27/06 17:09:08 (3 years ago)
Author:
rgrp
Message:

Create Statistic domain object separate from the Concordance object to try and improve performance.

Generating the concordance page is very slow. Tracked this down to (a) generation of the word list (which involved iterating over every item in the concordance table) and (b) calculating statistics from the concordance table using count(*).

By creating a dedicated Statistic object holding occurrences per text and word, I hoped to improve both of these. With respect to item (a), there was a fairly good speed improvement, from 23s to ~3s on my Mac OS X machine using the sonnets as the corpus. On (b) there was not much improvement, as we still have to do one db read per word and count(*) is pretty efficient (the cost when using SQLObject lies in all the db reads, not in the original db query). On my local machine it still takes ~30s to load the concordance page :( -- looks like caching the HTML may be the simplest way forward.

  • trunk/src/shakespeare/dm.py: add Statistic domain object
  • trunk/src/shakespeare/dm_test.py: add relevant tests
  • trunk/src/shakespeare/concordance.py:
    • ConcordanceBuilder.add_text: adapted it to write values into Statistic object
    • ConcordanceBase.keys(): use Statistic objects to get word lists rather than Concordance objects
    • Statistics.get: use Statistic rather than Concordance (ironically was simpler when using Concordance)
    • make various related changes (_sql_filter -> sqlcc_filter and sqlstat_filter etc)
  • trunk/src/shakespeare/concordance_test.py: no new tests but some minor fixes to old ones (*not* related to the other changes though)

Previously had a Statistics object in shakespeare.concordance

Location:
trunk/src/shakespeare
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • trunk/src/shakespeare/concordance.py

    r51 r72  
    2222    """ 
    2323    sqlcc = shakespeare.dm.Concordance 
     24    sqlstat = shakespeare.dm.Statistic 
    2425 
    2526    def __init__(self, filter_names=None): 
     
    2930        """ 
    3031        self._filter_names = filter_names 
    31         # piece of sql to use in select to filter texts 
    32         self._sql_filter = True 
     32        self.sqlcc_filter = self._make_filter(self.sqlcc) 
     33        self.sqlstat_filter = self._make_filter(self.sqlstat) 
     34 
     35    def _make_filter(self, sqlobj): 
     36        sql_filter = True 
    3337        if self._filter_names is not None: 
    3438            arglist = [] 
    3539            for name in self._filter_names: 
    36                 newarg = self.sqlcc.q.textID == self._name2id(name) 
     40                newarg = sqlobj.q.textID == self._name2id(name) 
    3741                arglist.append(newarg) 
    38             self._sql_filter = sqlobject.OR(*arglist) 
     42            sql_filter = sqlobject.OR(*arglist) 
     43        return sql_filter 
    3944     
    4045    def _name2id(self, name): 
     
    4247 
    4348    def keys(self): 
    44         """Return list of words in concordance 
     49        """Return list of *distinct* words in concordance/statistics 
    4550        """ 
    46         # distinct does not help us because we need to DISTINCT word 
    47         # but can't do this with sqlobject 
    48         all = self.sqlcc.select(self._sql_filter, 
    49                            orderBy=self.sqlcc.q.word, 
    50                            distinct=True) 
     51        all = self.sqlstat.select(self.sqlstat_filter, 
     52                           orderBy=self.sqlstat.q.word, 
     53                           ) 
    5154        words = [ xx.word for xx in list(all) ] 
    5255        distinct = list(set(words)) 
     
    6366        @return: sqlobject query list  
    6467        """ 
    65         select = self.sqlcc.select(sqlobject.AND(self._sql_filter, self.sqlcc.q.word==word)) 
     68        select = self.sqlcc.select(sqlobject.AND(self.sqlcc_filter, self.sqlcc.q.word==word)) 
    6669        return select 
    6770 
     
    6972 
    7073    def get(self, word): 
    71         select = self.sqlcc.select( 
    72             sqlobject.AND(self._sql_filter, self.sqlcc.q.word==word) 
     74        select = self.sqlstat.select( 
     75            sqlobject.AND(self.sqlstat_filter, self.sqlstat.q.word==word) 
    7376            ) 
    74         return select.count() 
     77        total = 0 
     78        for stat in select: 
     79            total += stat.occurrences 
     80        return total 
    7581 
    7682class ConcordanceBuilder(object): 
     
    111117        lineCount = 0 
    112118        charIndex = 0 
     119        stats = {} 
    113120        trans = shakespeare.dm.Concordance._connection.transaction() 
    114121        for line in text.readlines(): 
     
    122129                                           line=lineCount, 
    123130                                           char_index=charIndex+match.start()) 
     131                stats[word] = stats.get(word, 0) + 1 
    124132            lineCount += 1 
    125133            charIndex += len(line) 
    126134        trans.commit() 
     135        trans = shakespeare.dm.Concordance._connection.transaction() 
     136        for word, value in stats.items(): 
     137            tresults  = shakespeare.dm.Statistic.select( 
     138                sqlobject.AND( 
     139                    shakespeare.dm.Statistic.q.textID == dmText.id, 
     140                    shakespeare.dm.Statistic.q.word == word 
     141                    )) 
     142            try: 
     143                dbstat = list(tresults)[0] 
     144                dbstat.occurrences += value 
     145            except: 
     146                shakespeare.dm.Statistic( 
     147                        connection=trans, 
     148                        text=dmText, 
     149                        word=word, 
     150                        occurrences=value 
     151                        ) 
     152        trans.commit() 
     153 
    127154 
    128155    def remove_text(self, name): 
  • trunk/src/shakespeare/concordance_test.py

    r40 r72  
    1515I had in charge at my depart for France, 
    1616As procurator to your excellence, 
     17A fake imperial line. 
    1718""" 
    1819    name = 'test-concordance' 
     
    2021     
    2122    # ['work_id', 'line-no', 'character-index'] } 
     23    # incomplete 
    2224    expConcordance = { 
    23         'fake' : [ (name, 0, 2), (name, 0, 7) ], 
     25        'fake' : [ (name, 0, 2), (name, 0, 7), (name, 5, 136) ], 
    2426        'suffolk' : [ (name, 1, 17), ], 
    2527        'high' : [ (name, 2, 37), ], 
     
    2729        } 
    2830 
     31    # incomplete 
    2932    expStats = { 
    30         'fake' : 2, 
     33        'fake' : 3, 
     34        'imperial' : 2, 
    3135        'suffolk' : 1, 
    3236        'high' : 1, 
     
    6266        for key, value in self.expConcordance.items(): 
    6367            listing = list(self.concordance.get(key)) 
    64             listing.reverse() 
    65             out = [ (xx.text.name, xx.line, xx.char_index) for xx in listing ] 
    66             assert out == value 
     68            assert len(listing) == len(value) 
     69            for xx in listing: 
     70                assert (xx.text.name, xx.line, xx.char_index) in value 
    6771 
    6872    def test_stats(self): 
    6973        for key, value in self.expStats.items(): 
    7074            out = self.statistics.get(key) 
     75            print key 
    7176            assert out == value 
    7277 
  • trunk/src/shakespeare/dm.py

    r51 r72  
    2020    Material.createTable(ifNotExists=True) 
    2121    Concordance.createTable(ifNotExists=True) 
     22    Statistic.createTable(ifNotExists=True) 
    2223 
    2324def cleandb(): 
     25    Statistic.dropTable(ifExists=True) 
    2426    Concordance.dropTable(ifExists=True) 
    2527    Material.dropTable(ifExists=True) 
     
    6365    text_index = sqlobject.DatabaseIndex('text') 
    6466 
     67class Statistic(sqlobject.SQLObject): 
     68 
     69    text = sqlobject.ForeignKey('Material') 
     70    word = sqlobject.StringCol(length=50) 
     71    occurrences = sqlobject.IntCol(default=1) 
     72 
     73    word_index = sqlobject.DatabaseIndex('word') 
     74    text_index = sqlobject.DatabaseIndex('text') 
     75 
    6576 
    6677# auto create db tables on import 
  • trunk/src/shakespeare/dm_test.py

    r51 r72  
     1import sqlobject 
     2 
    13import shakespeare.dm 
    24 
     
    4749        assert self.text == out1.text 
    4850 
     51class TestStatistic: 
     52 
     53    def setup_class(self): 
     54        self.name = 'test-123' 
     55        self.title = 'Hamlet' 
     56        self.text = shakespeare.dm.Material(name=self.name, title=self.title) 
     57        self.word = 'jones' 
     58        self.occurrences = 5 
     59        self.cc1 = shakespeare.dm.Statistic( 
     60                text=self.text, 
     61                word=self.word, 
     62                occurrences=self.occurrences 
     63                ) 
     64 
     65    def teardown_class(self): 
     66        shakespeare.dm.Statistic.delete(self.cc1.id) 
     67        shakespeare.dm.Material.delete(self.text.id) 
     68 
     69    def test1(self): 
     70        out1 = shakespeare.dm.Statistic.get(self.cc1.id) 
     71        assert self.text == out1.text 
     72        assert out1.occurrences == self.occurrences 
     73 
     74    def test_select(self): 
     75        tresults  = shakespeare.dm.Statistic.select( 
     76            sqlobject.AND( 
     77                shakespeare.dm.Statistic.q.textID == self.text.id, 
     78                shakespeare.dm.Statistic.q.word == self.word, 
     79                )) 
     80        num = tresults.count() 
     81        assert num == 1 
     82