Changeset 74

Show
Ignore:
Timestamp:
12/27/06 18:02:06 (3 years ago)
Author:
rgrp
Message:

Improve the ignoring of non-words by the concordance/statistic builder, and fix a bug introduced by changeset:72.

  • src/shakespeare/concordance.py:
    • add ignore_word method to replace the simple test in the add_text method, and improve on that test using, inter alia:
    • is_roman_numeral: new method to test whether a word is a Roman numeral
    • non_words: attribute listing non-words
    • remove_text: (bugfix) was removing only the associated Concordance objects, not the associated Statistic objects
  • src/shakespeare/concordance_test.py:
    • test_is_roman_numeral
    • test_ignore_word
Location:
trunk/src/shakespeare
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • trunk/src/shakespeare/concordance.py

    r72 r74  
    8989 
    9090    words_to_ignore = [  
    91         # 'a', 'the', 'and', 'as', 'are', 'be', 'but', 'd', 'in' 
     91        # 'a', 'the', 'and', 'as', 'are', 'be', 'but', 'in' 
    9292                        ] 
     93    non_words = [  
     94            'd', # accus'd 
     95            't', 
     96            ] 
     97 
     98    def is_roman_numeral(self, word): 
     99        digits = [ 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix' ] 
     100        others = [ 'l', 'x', 'c' ] 
     101        if word == 'i': return False # exception because this conflicts with I 
     102        while word[0] in others: 
     103            if len(word) == 1: 
     104                return True 
     105            else: 
     106                word = word[1:] 
     107        return word in digits 
     108 
     109    def ignore_word(self, word): 
     110        "Return True if this word should not be added to the concordance." 
     111        bool1 = word in self.words_to_ignore 
     112        bool2 = word in self.non_words 
     113        # do roman numerals 
     114        bool3 = self.is_roman_numeral(word) 
     115        return bool1 or bool2 or bool3 
    93116 
    94117    def _text_already_done(self, text): 
     
    122145            for match in self.word_regex.finditer(line): 
    123146                word = match.group().lower() # case insensitive 
    124                 if word in self.words_to_ignore: 
     147                if self.ignore_word(word): 
    125148                    continue 
    126149                shakespeare.dm.Concordance(connection=trans, 
     
    164187        for rec in recs: 
    165188            shakespeare.dm.Concordance.delete(rec.id) 
     189        stats = shakespeare.dm.Statistic.select( 
     190                shakespeare.dm.Statistic.q.textID==dmText.id 
     191                ) 
     192        for stat in stats: 
     193            shakespeare.dm.Statistic.delete(stat.id) 
    166194 
  • trunk/src/shakespeare/concordance_test.py

    r72 r74  
    6363        assert exp == out 
    6464 
     65    def test_is_roman_numeral(self): 
     66        testvals = [ 'ii', 'v', 'vi', 'xi', 'xx', 'xxi', 'xlvi', 'c', 'cvi' ] 
     67        for val in testvals: 
     68            assert self.builder.is_roman_numeral(val) 
     69 
     70    def test_ignore_word(self): 
     71        testvals = [ 'd', 't' ] 
     72        for val in testvals: 
     73            assert self.builder.ignore_word(val) 
     74 
    6575    def test_concordance(self): 
    6676        for key, value in self.expConcordance.items():