Changeset 172
- Timestamp:
- 08/10/08 13:27:25 (4 months ago)
- Files:
- trunk/shakespeare/search.py (modified) (1 diff)
- trunk/shakespeare/tests/search_test.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/shakespeare/search.py
Revision 169 Revision 172 1 # Support for indexing and searching texts using xapian 1 '''Support for indexing and searching texts using xapian. 2 3 Architecture 4 ============ 5 6 For information on theoretical structure of Xapian see: 7 http://xapian.org/docs/intro_ir.html 8 9 For basic demo python code see: http://xapian.org/docs/bindings/python/ 10 11 For helpful example of using Xapian in python (including metadata, add_post 12 etc) see: 13 14 * http://www.thesamet.com/blog/2007/02/04/pumping-up-your-applications-with-xapian-full-text-search/ 15 * http://www.rkblog.rk.edu.pl/w/p/xapian-python/ 16 17 Here we discuss how we can use Xapian in OS. Two main tasks: 18 19 1. Do search 20 2. Produce statistics 21 22 Second task just requires stemming support, first requires full Xapian 23 facilities. Main question for indexing is: 24 25 * What is our atomization level. I.e. what are 'documents' we index? Is it: 26 * A whole poem or play 27 * Is it a paragraph within a work 28 * Is it a character's whole speech? 29 30 TODO: 31 * add metadata (e.g. which character is speaking, work id ...) 
32 ''' 2 import os 33 import os 34 import re 3 35 4 import xapian 36 import xapian 5 37 6 class SearchIndex(object): 38 class SearchIndex(object): 7 def __init__(self, index_dir): 39 def __init__(self, index_dir): 8 self.index_dir = index_dir 40 self.index_dir = index_dir 9 41 10 @classmethod 42 @classmethod 11 def config_index_dir(self): 43 def config_index_dir(self): 12 '''Get the search index directory specified in the config.''' 44 '''Get the search index directory specified in the config.''' 13 import shakespeare 45 import shakespeare 14 conf = shakespeare.conf() 46 conf = shakespeare.conf() 15 index_dir = conf['search_index_dir'] 47 index_dir = conf['search_index_dir'] 16 return index_dir 48 return index_dir 17 49 18 @classmethod 50 @classmethod 19 def default_index(self): 51 def default_index(self): 20 '''Return a SearchIndex instance initialized with the path specified in 52 '''Return a SearchIndex instance initialized with the path specified in 21 the configuration file. 53 the configuration file. 22 ''' 54 ''' 23 index_dir = self.config_index_dir() 55 index_dir = self.config_index_dir() 24 if not os.path.exists(index_dir): 56 if not os.path.exists(index_dir): 25 os.makedirs(index_dir) 57 os.makedirs(index_dir) 26 return SearchIndex(index_dir) 58 return SearchIndex(index_dir) 27 59 60 @classmethod 61 def get_stats(self, fileobj): 62 '''Get statistics on text in fileobj. 63 64 Words are stemmed so that e.g. love and loved count as the same word. 65 ''' 66 # (?) 
maybe could use xapian.TermGenerator to split document 67 WORD_RE = re.compile('\\w{1,32}', re.U) 68 stemmer = xapian.Stem('english') 69 results = {} 70 text = fileobj.read() 71 text = text.encode('utf8') 72 for term in WORD_RE.finditer(text): 73 word = term.group() 74 word = word.lower() 75 stemmed_word = stemmer(word) 76 results[stemmed_word] = results.get(stemmed_word, 0) + 1 77 return results 78 28 def add_item(self, fileobj): 79 def add_item(self, fileobj): 29 # TODO: remove this comment as no longer relevant (?)30 #create the folder for a writable db: alter path31 document = xapian.WritableDatabase (self.index_dir, xapian.DB_CREATE_OR_OPEN) 80 document = xapian.WritableDatabase (self.index_dir, xapian.DB_CREATE_OR_OPEN) 32 indexer = xapian.TermGenerator() 81 indexer = xapian.TermGenerator() 33 stemmer = xapian.Stem("english") 82 stemmer = xapian.Stem("english") 34 indexer.set_stemmer(stemmer) 83 indexer.set_stemmer(stemmer) 35 84 36 para = '' 85 para = '' 37 try: 86 try: 38 for line in fileobj:87 for line in fileobj: 39 line = line.strip() 88 line = line.strip() 40 if line == '': 89 if line == '': 41 if para != '': 90 if para != '': 42 doc = xapian.Document() 91 doc = xapian.Document() 43 doc.set_data(para) 92 doc.set_data(para) 44 93 45 indexer.set_document(doc) 94 indexer.set_document(doc) 95 # this *will* include positional information 46 indexer.index_text(para) 96 indexer.index_text(para) 47 97 48 # Add the document to the database. 98 # Add the document to the database. 49 document.add_document(doc) 99 document.add_document(doc) 50 para = '' 100 para = '' 51 else: 101 else: 52 if para != '': 102 if para != '': 53 para += ' '103 para += '\n' 54 para += line 104 para += line 55 except StopIteration: 105 except StopIteration: 56 # TODO: what is happening here? 106 # TODO: what is happening here? 57 pass 107 pass 58 print Stopped59 108 60 def search(self, query_string): 109 def search(self, query_string): 61 # Open the database for searching. 
110 # Open the database for searching. 62 database = xapian.Database(self.index_dir) 111 database = xapian.Database(self.index_dir) 63 112 64 # Start an enquire session. 113 # Start an enquire session. 65 enquire = xapian.Enquire(database) 114 enquire = xapian.Enquire(database) 66 115 67 # Parse the query string to produce a Xapian::Query object. 116 # Parse the query string to produce a Xapian::Query object. 68 qp = xapian.QueryParser() 117 qp = xapian.QueryParser() 69 stemmer = xapian.Stem("english") 118 stemmer = xapian.Stem("english") 70 qp.set_stemmer(stemmer) 119 qp.set_stemmer(stemmer) 71 qp.set_database(database) 120 qp.set_database(database) 72 qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) 121 qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) 73 query = qp.parse_query(query_string) 122 query = qp.parse_query(query_string) 74 print "Parsed query is: %s" % query.get_description() 123 print "Parsed query is: %s" % query.get_description() 75 124 76 # Find the top 10 results for the query. 125 # Find the top 10 results for the query. 77 enquire.set_query(query) 126 enquire.set_query(query) 78 matches = enquire.get_mset(0, 10) 127 # get search results offset, offset+count 128 offset = 0 129 count = 10 130 matches = enquire.get_mset(offset, count) 79 return matches 131 return matches 80 132 81 @classmethod 133 @classmethod 82 def print_matches(self, matches): 134 def print_matches(self, matches): 83 # Display the results. 135 # Display the results. 84 print "%i results found." % matches.get_matches_estimated() 136 print "%i results found." % matches.get_matches_estimated() 85 print "Results 1-%i:" % matches.size() 137 print "Results 1-%i:" % matches.size() 86 138 87 for m in matches: 139 for m in matches: 88 print "%i: %i%% docid=%i [%s]" % (m.rank + 1, m.percent, m.docid, m.document.get_data()) 140 print "%i: %i%% docid=%i [%s]" % (m.rank + 1, m.percent, m.docid, m.document.get_data()) 89 141 trunk/shakespeare/tests/search_test.py
Revision 169 Revision 172 1 import os 1 import os 2 import shutil 2 import shutil 3 import tempfile 3 import tempfile 4 import StringIO 4 import StringIO 5 5 6 import shakespeare.search 6 import shakespeare.search 7 7 8 class TestSearch: 8 class TestSearch: 9 # break up a little to make indexing more interesting 9 # break up a little to make indexing more interesting 10 text = \ 10 text = \ 11 ''' 11 ''' 12 Shall I compare thee to a summer's day? 12 Shall I compare thee to a summer's day? 13 Thou art more lovely and more temperate: 13 Thou art more lovely and more temperate: 14 Rough winds do shake the darling buds of May, 14 Rough winds do shake the darling buds of May, 15 And summer's lease hath all too short a date: 15 And summer's lease hath all too short a date: 16 16 17 Sometime too hot the eye of heaven shines, 17 Sometime too hot the eye of heaven shines, 18 And often is his gold complexion dimm'd, 18 And often is his gold complexion dimm'd, 19 And every fair from fair sometime declines, 19 And every fair from fair sometime declines, 20 By chance, or nature's changing course untrimm'd: 20 By chance, or nature's changing course untrimm'd: 21 21 22 But thy eternal summer shall not fade, 22 But thy eternal summer shall not fade, 23 Nor lose possession of that fair thou ow'st, 23 Nor lose possession of that fair thou ow'st, 24 Nor shall death brag thou wander'st in his shade, 24 Nor shall death brag thou wander'st in his shade, 25 When in eternal lines to time thou grow'st, 25 When in eternal lines to time thou grow'st, 26 26 27 So long as men can breathe, or eyes can see, 27 So long as men can breathe, or eyes can see, 28 So long lives this, and this gives life to thee. 28 So long lives this, and this gives life to thee. 
29 ''' 29 ''' 30 30 31 def setUp(self): 31 def setUp(self): 32 basetmp = tempfile.gettempdir() 32 basetmp = tempfile.gettempdir() 33 self.tmpdir = os.path.join(basetmp, 'openshkspr-search') 33 self.tmpdir = os.path.join(basetmp, 'openshkspr-search') 34 # we leave directory in existence to help with debugging 34 # we leave directory in existence to help with debugging 35 if os.path.exists(self.tmpdir): 35 if os.path.exists(self.tmpdir): 36 shutil.rmtree(self.tmpdir) 36 shutil.rmtree(self.tmpdir) 37 os.makedirs(self.tmpdir) 37 os.makedirs(self.tmpdir) 38 self.index = shakespeare.search.SearchIndex(self.tmpdir) 38 self.index = shakespeare.search.SearchIndex(self.tmpdir) 39 39 40 def test_add_item(self): 40 def test_add_item(self): 41 self.index.add_item(StringIO.StringIO(self.text)) 41 self.index.add_item(StringIO.StringIO(self.text)) 42 42 43 def test_search(self): 43 def test_search(self): 44 self.index.add_item(StringIO.StringIO(self.text)) 44 self.index.add_item(StringIO.StringIO(self.text)) 45 out = self.index.search('summer') 45 out = self.index.search('summer') 46 assert len(out) == 2 46 assert len(out) == 2 47 mset1 = out[1] 48 # 'But thy eternal summer ... 49 exp = "But thy eternal summer shall not fade,\nNor lose possession of that fair thou ow'st," 50 assert mset1.document.get_data().startswith(exp) 47 out = self.index.search('rough') 51 out = self.index.search('rough') 48 assert len(out) == 1 52 assert len(out) == 1 49 53 54 def test_get_stats(self): 55 simpletext = 'Death death dead love loved loving' 56 out = self.index.get_stats(StringIO.StringIO(simpletext)) 57 assert len(out) == 3 58 assert out['love'] == 3 59 assert out['death'] == 2 60 assert out['dead'] == 1 61
