Changeset 2
- Timestamp: 05/09/06 20:01:41 (3 years ago)
- Files:
- trunk/bin/shakespeare-admin (modified) (1 diff)
- trunk/src/shakespeare/cherrypy_handler.py (modified) (1 diff)
- trunk/src/shakespeare/concordancer.py (modified) (1 diff)
- trunk/src/shakespeare/concordancer_test.py (modified) (1 diff)
- trunk/src/shakespeare/format.py (modified) (1 diff)
- trunk/src/shakespeare/format_test.py (modified) (1 diff)
- trunk/src/shakespeare/template/concordance.html (added)
- trunk/src/shakespeare/utils.py (modified) (1 diff)
- trunk/src/shakespeare/work.py (moved) (moved from trunk/src/shakespeare/download.py) (1 diff)
- trunk/src/shakespeare/work_test.py (moved) (moved from trunk/src/shakespeare/download_test.py) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/bin/shakespeare-admin
Revision 1 Revision 2 1 #!/usr/bin/env python 1 #!/usr/bin/env python 2 2 3 import cmd 3 import cmd 4 import StringIO 4 import StringIO 5 5 6 from shakespeare.format import GutenbergShakespeare 6 from shakespeare.format import GutenbergShakespeare 7 from shakespeare.download import download_all_shakespeare8 from shakespeare.download import make_index9 from shakespeare.concordancer import make_concordancer 7 from shakespeare.concordancer import make_concordancer 10 import shakespeare.utils as utils 8 import shakespeare.utils as utils 9 import shakespeare.work 11 10 12 class ShakespeareAdmin(cmd.Cmd): 11 class ShakespeareAdmin(cmd.Cmd): 13 12 14 def __init__(self): 13 def __init__(self): 15 cmd.Cmd.__init__(self) # cmd.Cmd is not a new style class 14 cmd.Cmd.__init__(self) # cmd.Cmd is not a new style class 16 self._index = make_index()15 self._index = shakespeare.work.index.all 17 16 18 def do_format(self, line=None): 17 def do_format(self, line=None): 19 path = sys.argv[1] 18 path = sys.argv[1] 20 x = GutenbergShakespeare(file(path)) 19 x = GutenbergShakespeare(file(path)) 21 print x.extract_text() 20 print x.extract_text() 22 21 23 def help_format(self, line=None): 22 def help_format(self, line=None): 24 usage = \ 23 usage = \ 25 '''Format a raw gutenberg text. 24 '''Format a raw gutenberg text. 26 25 27 Take a raw gutenberg text in and return (on stdout) the core text (i.e. strip 26 Take a raw gutenberg text in and return (on stdout) the core text (i.e. 
strip 28 out all the gutenberg bumpf)''' 27 out all the gutenberg bumpf)''' 29 print usage 28 print usage 30 29 31 def do_format_all(self, line): 30 def do_format_all(self, line): 32 index = self._index 31 index = self._index 33 for item in index: 32 for item in index: 34 url = item[1] 33 url = item[1] 35 src = utils.get_local_path(url) 34 src = utils.get_local_path(url) 36 dest = utils.get_local_path(url, 'cleaned') 35 dest = utils.get_local_path(url, 'cleaned') 37 infile = file(src) 36 infile = file(src) 38 if src.endswith('wssnt10.txt'): # if it is the sonnets need a hack 37 if src.endswith('wssnt10.txt'): # if it is the sonnets need a hack 39 # delete last 140 characters 38 # delete last 140 characters 40 tmp1 = infile.read() 39 tmp1 = infile.read() 41 infile = StringIO.StringIO(tmp1[:-120]) 40 infile = StringIO.StringIO(tmp1[:-120]) 42 formatter = GutenbergShakespeare(infile) 41 formatter = GutenbergShakespeare(infile) 43 ff = file(dest, 'w') 42 ff = file(dest, 'w') 44 out = formatter.extract_text() 43 out = formatter.extract_text() 45 ff.write(out) 44 ff.write(out) 46 ff.close() 45 ff.close() 47 46 48 def do_download_texts(self, line): 47 def do_download_texts(self, line): 49 download_all_shakespeare() 48 for item in self._index: 49 utils.download_url(item[1]) 50 50 51 def help_download_texts(self, line=None): 51 def help_download_texts(self, line=None): 52 print download_all_shakespeare.__doc__ 52 usage = \ 53 """ 54 Download from Project Gutenberg all the shakespeare texts listed in the index.""" 55 print usage 53 56 54 def do_print_index(self, line): 57 def do_print_index(self, line): 55 for row in self._index: 58 for row in self._index: 56 print row 59 print row 57 60 58 def help_print_index(self, line=None): 61 def help_print_index(self, line=None): 59 usage = \ 62 usage = \ 60 '''Print index of Shakespeare texts to stdout''' 63 '''Print index of Shakespeare texts to stdout''' 61 print usage 64 print usage 62 65 63 def do_make_concordancer(self, 
line=None): 66 def do_make_concordancer(self, line=None): 64 make_concordancer() 67 if line is not None: 68 textsToAdd = [] 69 textsUrls = line.split() 70 for item in self._index: 71 if item[1] in textsUrls: 72 textsToAdd.append(item) 73 make_concordancer(textsToAdd) 74 else: 75 make_concordancer() 65 76 66 def help_make_concordancer(self, line=None): 77 def help_make_concordancer(self, line=None): 67 print make_concordance.__doc__ 78 print make_concordance.__doc__ 68 79 69 def do_help(self, line=None): 80 def do_help(self, line=None): 70 cmd.Cmd.do_help(self, line) 81 cmd.Cmd.do_help(self, line) 71 82 72 def do_quit(self): 83 def do_quit(self): 73 sys.exit() 84 sys.exit() 74 85 75 def do_EOF(self, *args): 86 def do_EOF(self, *args): 76 print '' 87 print '' 77 sys.exit() 88 sys.exit() 78 89 79 if __name__ == '__main__': 90 if __name__ == '__main__': 80 import sys 91 import sys 81 usage = """ 92 usage = """ 82 %prog cmd 93 %prog cmd 83 94 84 format-all: format all gutenberg etexts automatically 95 format-all: format all gutenberg etexts automatically 85 download: download the gutenberg etexts and store them in the cache 96 download: download the gutenberg etexts and store them in the cache 86 """ 97 """ 87 adminCmd = ShakespeareAdmin() 98 adminCmd = ShakespeareAdmin() 88 if len(sys.argv) < 2: 99 if len(sys.argv) < 2: 89 while 1: 100 while 1: 90 try: 101 try: 91 adminCmd.cmdloop() 102 adminCmd.cmdloop() 92 break 103 break 93 except KeyboardInterrupt: 104 except KeyboardInterrupt: 94 raise 105 raise 95 else: 106 else: 96 args = ' '.join(sys.argv[1:]) 107 args = ' '.join(sys.argv[1:]) 97 args = args.replace('-','_') 108 args = args.replace('-','_') 98 adminCmd.onecmd(args) 109 adminCmd.onecmd(args) trunk/src/shakespeare/cherrypy_handler.py
Revision 1 Revision 2 1 """ 1 """ 2 Tutorial - Passing variables 2 Web interface to view and analyze shakespeare texts. 3 4 This tutorial shows you how to pass GET/POST variables to methods. 5 """ 3 """ 6 import cherrypy 4 import cherrypy 7 import os 5 import os 8 6 9 from shakespeare.download import make_index 7 import shakespeare.work 10 index = make_index()8 index = shakespeare.work.index.all 11 from shakespeare.utils import get_local_path 9 from shakespeare.utils import get_local_path 12 import shakespeare.format 10 import shakespeare.format 13 11 14 import shakespeare.concordancer 12 import shakespeare.concordancer 15 c oncordancer= shakespeare.concordancer.get_concordancer()13 cc = shakespeare.concordancer.get_concordancer() 16 14 17 class WelcomePage: 15 class WelcomePage: 18 16 19 def index(self): 17 def index(self): 20 try: 18 try: 21 import kid 19 import kid 22 kid.enable_import(suffixes=[".html"]) 20 kid.enable_import(suffixes=[".html"]) 23 import shakespeare.template.index 21 import shakespeare.template.index 24 template = shakespeare.template.index.Template(works_index=index) 22 template = shakespeare.template.index.Template(works_index=index) 25 result = str(template) 23 result = str(template) 26 # result = 'test' 24 # result = 'test' 27 return result 25 return result 28 except Exception, inst: 26 except Exception, inst: 29 return '<p><strong>There was an error: ' + str(inst) + '</strong></p>' 27 return '<p><strong>There was an error: ' + str(inst) + '</strong></p>' 30 index.exposed = True 28 index.exposed = True 31 29 32 def view(self, text_url=None, version='cleaned', format='plain'): 30 def view(self, text_url=None, version='cleaned', format='plain'): 33 localPath = get_local_path(text_url, version) 31 localPath = get_local_path(text_url, version) 34 ff = file(localPath) 32 ff = file(localPath) 35 if format == 'plain': 33 if format == 'plain': 36 result = '<pre>' + ff.read() + '</pre>' 34 result = '<pre>' + ff.read() + '</pre>' 37 else: 35 else: 38 
formatter = shakespeare.format.TextFormatter(ff) 36 formatter = shakespeare.format.TextFormatter(ff) 39 result = formatter.format(format) 37 result = formatter.format(format) 40 # import kid 38 # import kid 41 # kid.enable_import(suffixes=['.html']) 39 # kid.enable_import(suffixes=['.html']) 42 # module = __import__('shakespeare.template.format_' + format, '', '', '*') 40 # module = __import__('shakespeare.template.format_' + format, '', '', '*') 43 # template = module.Template(fileobj=ff) 41 # template = module.Template(fileobj=ff) 44 # result = template.serialize() 42 # result = template.serialize() 45 ff.close() 43 ff.close() 46 return result 44 return result 47 view.exposed = True 45 view.exposed = True 48 46 49 # def concordance(self): 47 def concordance(self): 50 # import kid 48 import kid 51 # kid.enable_import(suffixes=[".html"]) 49 kid.enable_import(suffixes=[".html"]) 52 # import shakespeare.template.concordance 50 import shakespeare.template.concordance 53 # template = shakespeare.template.concordance.Template(concordancer=concordancer) 51 concordance = cc.concordance 54 # result = template.serialize() 52 words = concordance.keys() 55 # return result 53 words.sort() 56 # concordance.exposed = True 54 template = shakespeare.template.concordance.Template(words=words, stats=cc.stats) 55 result = template.serialize() 56 # result = str(cc) 57 return result 58 concordance.exposed = True 57 59 58 60 59 cherrypy.root = WelcomePage() 61 cherrypy.root = WelcomePage() 60 62 61 if __name__ == '__main__': 63 if __name__ == '__main__': 62 cherrypy.lowercase_api = True 64 cherrypy.lowercase_api = True 63 # cherrypy.config.update(file = 'tutorial.conf') 65 # cherrypy.config.update(file = 'tutorial.conf') 64 cherrypy.config.update({'server.showTracebacks' : True }) 66 cherrypy.config.update({'server.showTracebacks' : True }) 65 cherrypy.server.start() 67 cherrypy.server.start() 66 68 67 """ 69 """ 68 [global] 70 [global] 69 server.socketPort = 8080 71 server.socketPort = 
8080 70 server.threadPool = 10 72 server.threadPool = 10 71 server.environment = "production" 73 server.environment = "production" 72 # server.showTracebacks = True 74 # server.showTracebacks = True 73 # server.logToScreen = False 75 # server.logToScreen = False 74 """ 76 """ trunk/src/shakespeare/concordancer.py
Revision 1 Revision 2 1 import re 1 import re 2 import cPickle 2 import cPickle 3 3 4 import utils 4 import utils 5 from download import make_index 5 import shakespeare.work 6 6 7 def make_concordancer(showProgress=True): 7 def make_concordancer( 8 texts_to_add=shakespeare.work.index.all, 9 out_path=utils.get_local_path('concordance.p'), 10 ): 8 """Create Concordancer object and use it to produce concordance and stats 11 """Create Concordancer object and use it to produce concordance and stats 9 for all non-folio works. 12 for all non-folio works. 10 Save resulting object in pickled form to 'concordance.p'. 13 @out_path: where to save the concordance 14 @texts_to_add: index items that should be added to the concordance 11 """ 15 """ 12 def _print(msg):13 if showProgress:14 print(msg)15 index = make_index()16 cc = Concordancer() 16 cc = Concordancer() 17 for item in index:17 for item in texts_to_add: 18 url = item[1] 18 url = item[1] 19 isfolio = item[2] == 'folio' 19 isfolio = item[2] == 'folio' 20 src = utils.get_local_path(url, 'cleaned') 20 src = utils.get_local_path(url, 'cleaned') 21 if isfolio: 21 cc.add_text(file(src)) 22 _print('Is folio so skipping [%s]' % src) 22 ccFile = file(out_path, 'w') 23 else: 24 _print('Adding text [%s]' % src) 25 cc.add_text(file(src)) 26 filePath = utils.get_local_path('concordance.p') 27 ccFile = file(filePath, 'w') 28 cPickle.dump(cc, ccFile) 23 cPickle.dump(cc, ccFile) 29 24 30 def get_concordancer(): 25 def get_concordancer(): 31 """Get a concordancer containing concordance and stats by unpickling cached 26 """Get a concordancer containing concordance and stats by unpickling cached 32 copy. 27 copy. 
33 """ 28 """ 34 filePath = utils.get_local_path('concordance.p') 29 filePath = utils.get_local_path('concordance.p') 35 cc = cPickle.load(file(filePath)) 30 cc = cPickle.load(file(filePath)) 36 return cc 31 return cc 37 32 38 class Concordancer(object): 33 class Concordancer(object): 39 """Generate a concordance and associated statistics for a set of texts. 34 """Generate a concordance and associated statistics for a set of texts. 40 35 41 Concordance and statistics are provided as dictionaries keyed by words. 36 Concordance and statistics are provided as dictionaries keyed by words. 42 NB: all word keys have been lower-cased in order to render them case-insensitive 37 NB: all word keys have been lower-cased in order to render them case-insensitive 43 """ 38 """ 44 39 45 # multiline, unicode and ignorecase 40 # multiline, unicode and ignorecase 46 wordRegex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) 41 word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) 42 43 words_to_ignore = [ 'a', 'the', 'and', 44 'as', 'are', 'be', 45 'but', 'd', 'in' 46 ] 47 47 48 def __init__(self): 48 def __init__(self): 49 self.concordance = {} 49 self.concordance = {} 50 self.stats = {} 50 self.stats = {} 51 51 52 def add_text(self, text, textId=None): 52 def add_text(self, text, textId=None): 53 """Add a text to the concordance. 53 """Add a text to the concordance. 
54 @text: file like object containing text to add 54 @text: file like object containing text to add 55 """ 55 """ 56 lineCount = 0 56 lineCount = 0 57 charIndex = 0 57 charIndex = 0 58 for line in text.readlines(): 58 for line in text.readlines(): 59 for match in self.word Regex.finditer(line):59 for match in self.word_regex.finditer(line): 60 word = match.group().lower() # case insensitive 60 word = match.group().lower() # case insensitive 61 if word in self.words_to_ignore: 62 continue 61 oldValue = self.concordance.get(word, []) 63 oldValue = self.concordance.get(word, []) 62 oldStat = self.stats.get(word, 0) 64 oldStat = self.stats.get(word, 0) 63 oldValue.append( (lineCount, charIndex + match.start()) ) 65 tloc = (textId, lineCount, charIndex + match.start()) 66 oldValue.append(tloc) 64 self.concordance[word] = oldValue 67 self.concordance[word] = oldValue 65 self.stats[word] = oldStat + 1 68 self.stats[word] = oldStat + 1 66 lineCount += 1 69 lineCount += 1 67 charIndex += len(line) 70 charIndex += len(line) 68 71 69 trunk/src/shakespeare/concordancer_test.py
Revision 1 Revision 2 1 import unittest 1 import unittest 2 import StringIO 2 import StringIO 3 3 4 import concordancer 4 import concordancer 5 5 6 def test_suite(): 6 def test_suite(): 7 suites = [ 7 suites = [ 8 unittest.makeSuite(ConcordancerTest), 8 unittest.makeSuite(ConcordancerTest), 9 ] 9 ] 10 return unittest.TestSuite(suites) 10 return unittest.TestSuite(suites) 11 11 12 class ConcordancerTest(unittest.TestCase): 12 class ConcordancerTest(unittest.TestCase): 13 13 14 inText = \ 14 inText = \ 15 """A fake fake line 15 """A fake fake line 16 SUFFOLK. 16 SUFFOLK. 17 As by your high imperial Majesty 17 As by your high imperial Majesty 18 I had in charge at my depart for France, 18 I had in charge at my depart for France, 19 As procurator to your excellence, 19 As procurator to your excellence, 20 """ 20 """ 21 textId = 1 21 22 22 # ['work_id', 'line-no', 'character-index'] } 23 # ['work_id', 'line-no', 'character-index'] } 23 expConcordance = { 24 expConcordance = { 24 'fake' : [ ( 0, 2), (0, 7) ],25 'fake' : [ (textId, 0, 2), (textId, 0, 7) ], 25 'suffolk' : [ ( 1, 17), ],26 'suffolk' : [ (textId, 1, 17), ], 26 'high' : [ ( 2, 37), ],27 'high' : [ (textId, 2, 37), ], 27 } 28 } 28 29 29 expStats = { 30 expStats = { 30 'fake' : 2, 31 'fake' : 2, 31 'suffolk' : 1, 32 'suffolk' : 1, 32 'high' : 1, 33 'high' : 1, 33 } 34 } 34 35 35 def setUp(self): 36 def setUp(self): 36 self.cc = concordancer.Concordancer() 37 self.cc = concordancer.Concordancer() 37 self.cc.add_text(StringIO.StringIO(self.inText), 'King Henry VI')38 self.cc.add_text(StringIO.StringIO(self.inText), self.textId) 38 39 39 def test__process_line(self): 40 def test__process_line(self): 40 line = 'the - quick, brown. fox-jumped over$ the_lazy do8g.' 41 line = 'the - quick, brown. fox-jumped over$ the_lazy do8g.' 
41 exp = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the_lazy', 'do8g' ] 42 exp = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the_lazy', 'do8g' ] 42 out = self.cc.word Regex.findall(line)43 out = self.cc.word_regex.findall(line) 43 self.assertEqual(exp, out) 44 self.assertEqual(exp, out) 44 45 45 def test_concordance(self): 46 def test_concordance(self): 46 for key, value in self.expConcordance.items(): 47 for key, value in self.expConcordance.items(): 47 out = self.cc.concordance[key] 48 out = self.cc.concordance[key] 48 self.assertEqual(out, value) 49 self.assertEqual(out, value) 49 50 50 def test_stats(self): 51 def test_stats(self): 51 for key, value in self.expStats.items(): 52 for key, value in self.expStats.items(): 52 out = self.cc.stats[key] 53 out = self.cc.stats[key] 53 self.assertEqual(out, value) 54 self.assertEqual(out, value) 55 56 def test_make_concordancer(self): 57 import tempfile 58 filePath = tempfile.mkstemp()[1] 59 import shakespeare.work 60 index = shakespeare.work.index.all 61 concordancer.make_concordancer(index[2:3], filePath) trunk/src/shakespeare/format.py
Revision 1 Revision 2 1 """ 1 """ 2 Clean up Gutenberg texts by removing all the header and footer bumpf 2 Clean up Gutenberg texts by removing all the header and footer bumpf 3 """ 3 """ 4 4 5 import re 5 import re 6 import download7 6 8 headerEndPhrases = ["Project Gutenberg's Etext of", 'This etext was prepared by'] 7 headerEndPhrases = ["Project Gutenberg's Etext of", 'This etext was prepared by'] 9 notesStartPhrases = ["Executive Director's Notes:"] 8 notesStartPhrases = ["Executive Director's Notes:"] 10 notesEndPhrases = ['David Reed'] 9 notesEndPhrases = ['David Reed'] 11 footerStartPhrases = ['End of Project Gutenberg', 'End of The Project Gutenberg' 10 footerStartPhrases = ['End of Project Gutenberg', 'End of The Project Gutenberg' 12 ] 11 ] 13 12 14 def make_re_from_phrase(phrase): 13 def make_re_from_phrase(phrase): 15 """ 14 """ 16 Make a regular expression that matches a phrase and its surrounding 15 Make a regular expression that matches a phrase and its surrounding 17 paragraph, i.e. that look like: 16 paragraph, i.e. that look like: 18 17 19 ... phrase .... 18 ... phrase .... 
20 more text 19 more text 21 [blank] 20 [blank] 22 [blank]+ 21 [blank]+ 23 """ 22 """ 24 paragraphText = '(.+\S.+\n)*' # need \S to ensure not just whitespace 23 paragraphText = '(.+\S.+\n)*' # need \S to ensure not just whitespace 25 # [[TODO: check slowdown due to inclusion of '^.*' at start 24 # [[TODO: check slowdown due to inclusion of '^.*' at start 26 tmp = '^.*' + phrase + '.*\n' + paragraphText + '\s+' 25 tmp = '^.*' + phrase + '.*\n' + paragraphText + '\s+' 27 return re.compile(tmp, re.I | re.M) # make it case insensitive 26 return re.compile(tmp, re.I | re.M) # make it case insensitive 28 27 29 class GutenbergShakespeare(object): 28 class GutenbergShakespeare(object): 30 """ 29 """ 31 Process Gutenberg shakespeare texts 30 Process Gutenberg shakespeare texts 32 """ 31 """ 33 32 34 def __init__(self, etext): 33 def __init__(self, etext): 35 """ 34 """ 36 @param etext: file like object containing the etext 35 @param etext: file like object containing the etext 37 36 38 Procedure: 37 Procedure: 39 1. strip out header and footer bumpf 38 1. strip out header and footer bumpf 40 2. are there notes? If so strip them out 39 2. are there notes? 
If so strip them out 41 """ 40 """ 42 self.etext = etext 41 self.etext = etext 43 self.etextStr = self.etext.read() 42 self.etextStr = self.etext.read() 44 # normalize the line endings to save us grief later 43 # normalize the line endings to save us grief later 45 self.etextStr = self.etextStr.replace('\r\n', '\n') 44 self.etextStr = self.etextStr.replace('\r\n', '\n') 46 self.hasNotes = False 45 self.hasNotes = False 47 46 48 def _find_max(self, phrase, string): 47 def _find_max(self, phrase, string): 49 maxIndex = 0 48 maxIndex = 0 50 regex = make_re_from_phrase(phrase) 49 regex = make_re_from_phrase(phrase) 51 matches = regex.finditer(string) 50 matches = regex.finditer(string) 52 for match in matches: 51 for match in matches: 53 maxIndex = max(match.end(), maxIndex) 52 maxIndex = max(match.end(), maxIndex) 54 return maxIndex 53 return maxIndex 55 54 56 def _find_min(self, phrase, string): 55 def _find_min(self, phrase, string): 57 minIndex = len(string) 56 minIndex = len(string) 58 regex = make_re_from_phrase(phrase) 57 regex = make_re_from_phrase(phrase) 59 matches = regex.finditer(string) 58 matches = regex.finditer(string) 60 for match in matches: 59 for match in matches: 61 minIndex = min(match.start(), minIndex) 60 minIndex = min(match.start(), minIndex) 62 return minIndex 61 return minIndex 63 62 64 def extract_text(self): 63 def extract_text(self): 65 """Extract the core text. 64 """Extract the core text. 
66 """ 65 """ 67 self.notesEnd = self.get_notes_end() 66 self.notesEnd = self.get_notes_end() 68 self.headerEnd = self.get_header_end() 67 self.headerEnd = self.get_header_end() 69 self.footerStart = self.get_footer_start() 68 self.footerStart = self.get_footer_start() 70 startIndex = self.headerEnd 69 startIndex = self.headerEnd 71 if self.notesEnd > 0: 70 if self.notesEnd > 0: 72 startIndex = self.notesEnd 71 startIndex = self.notesEnd 73 return self.etextStr[startIndex : self.footerStart].rstrip() 72 return self.etextStr[startIndex : self.footerStart].rstrip() 74 73 75 def get_notes_end(self): 74 def get_notes_end(self): 76 "Return 0 if no notes" 75 "Return 0 if no notes" 77 indices = [ self._find_max(phrase, self.etextStr) for phrase in notesEndPhrases] 76 indices = [ self._find_max(phrase, self.etextStr) for phrase in notesEndPhrases] 78 index = max(indices) 77 index = max(indices) 79 return index 78 return index 80 79 81 def get_header_end(self): 80 def get_header_end(self): 82 indices = [ self._find_max(phrase, self.etextStr) for phrase in headerEndPhrases] 81 indices = [ self._find_max(phrase, self.etextStr) for phrase in headerEndPhrases] 83 return max(indices) 82 return max(indices) 84 83 85 def get_footer_start(self): 84 def get_footer_start(self): 86 indices = [ self._find_min(phrase, self.etextStr) for phrase in footerStartPhrases] 85 indices = [ self._find_min(phrase, self.etextStr) for phrase in footerStartPhrases] 87 return min(indices) 86 return min(indices) 88 87 89 class TextFormatter(object): 88 class TextFormatter(object): 90 """Format a provided text in a variety of ways. 89 """Format a provided text in a variety of ways. 
91 For example: add line numbers, convert to html with line ids etc 90 For example: add line numbers, convert to html with line ids etc 92 """ 91 """ 93 92 94 def __init__(self, file): 93 def __init__(self, file): 95 """ 94 """ 96 @file: file-like object containing a text in plain txt 95 @file: file-like object containing a text in plain txt 97 """ 96 """ 98 self.file = file 97 self.file = file 99 98 100 def format(self, format): 99 def format(self, format): 101 """ 100 """ 102 @format: the name specifying the format to use 101 @format: the name specifying the format to use 103 """ 102 """ 104 if format == 'lineno': 103 if format == 'lineno': 105 return self.add_line_numbers() 104 return self.add_line_numbers() 106 else: 105 else: 107 raise ValueError('Unknown format: %s' % format) 106 raise ValueError('Unknown format: %s' % format) 108 107 109 def add_line_numbers(self): 108 def add_line_numbers(self): 110 result = '' 109 result = '' 111 count = 0 110 count = 0 112 for line in self.file.readlines(): 111 for line in self.file.readlines(): 113 tlineno = str(count).ljust(4) # assume line no < 10000 112 tlineno = str(count).ljust(4) # assume line no < 10000 114 result += '<pre id="%s">%s %s</pre>\n' % (count, tlineno, line.rstrip()) 113 result += '<pre id="%s">%s %s</pre>\n' % (count, tlineno, line.rstrip()) 115 count += 1 114 count += 1 116 return result 115 return result trunk/src/shakespeare/format_test.py
Revision 1 Revision 2 1 import unittest 1 import unittest 2 2 3 import download3 import utils 4 from format import make_re_from_phrase, GutenbergShakespeare 4 from format import make_re_from_phrase, GutenbergShakespeare 5 5 6 def test_suite(): 6 def test_suite(): 7 suites = [ 7 suites = [ 8 unittest.makeSuite(FormatTest), 8 unittest.makeSuite(FormatTest), 9 unittest.makeSuite(GutenbergShakespeareTest), 9 unittest.makeSuite(GutenbergShakespeareTest), 10 ] 10 ] 11 return unittest.TestSuite(suites) 11 return unittest.TestSuite(suites) 12 12 13 class FormatTest(unittest.TestCase): 13 class FormatTest(unittest.TestCase): 14 14 15 def test_make_re_from_phrase(self): 15 def test_make_re_from_phrase(self): 16 outStr = """blah 16 outStr = """blah 17 17 18 18 19 """ 19 """ 20 inStr = outStr + 'All is Well that' 20 inStr = outStr + 'All is Well that' 21 regex = make_re_from_phrase('blah') 21 regex = make_re_from_phrase('blah') 22 out = regex.search(inStr) 22 out = regex.search(inStr) 23 self.assertEquals(out.group(), outStr) 23 self.assertEquals(out.group(), outStr) 24 24 25 def test_makeReFromPhrase2(self): 25 def test_makeReFromPhrase2(self): 26 outStr = """blah 26 outStr = """blah 27 joe 27 joe 28 hello 28 hello 29 29 30 30 31 """ 31 """ 32 inStr = outStr + 'All is Well that' 32 inStr = outStr + 'All is Well that' 33 regex = make_re_from_phrase('blah') 33 regex = make_re_from_phrase('blah') 34 out = regex.search(inStr) 34 out = regex.search(inStr) 35 self.assertEquals(out.group(), outStr) 35 self.assertEquals(out.group(), outStr) 36 36 37 class GutenbergShakespeareTest(unittest.TestCase): 37 class GutenbergShakespeareTest(unittest.TestCase): 38 etext1 = file( download.get_cache_path('0ws2510.txt'))38 etext1 = file(utils.get_cache_path('0ws2510.txt')) 39 etext2 = file( download.get_cache_path('2ws2510.txt'))39 etext2 = file(utils.get_cache_path('2ws2510.txt')) 40 gut1 = GutenbergShakespeare(etext1) 40 gut1 = GutenbergShakespeare(etext1) 41 gut2 = 
GutenbergShakespeare(etext2) 41 gut2 = GutenbergShakespeare(etext2) 42 42 43 def test_get_header_end(self): 43 def test_get_header_end(self): 44 out = self.gut1.get_header_end() 44 out = self.gut1.get_header_end() 45 exp = self.gut1.etextStr.index("Executive Director's Notes:") 45 exp = self.gut1.etextStr.index("Executive Director's Notes:") 46 self.assertEqual(out, exp) 46 self.assertEqual(out, exp) 47 47 48 def test_get_footer_start(self): 48 def test_get_footer_start(self): 49 out = self.gut1.get_footer_start() 49 out = self.gut1.get_footer_start() 50 # has no footer 50 # has no footer 51 exp = len(self.gut1.etextStr) 51 exp = len(self.gut1.etextStr) 52 self.assertEqual(out, exp) 52 self.assertEqual(out, exp) 53 53 54 out = self.gut2.get_footer_start() 54 out = self.gut2.get_footer_start() 55 exp = self.gut2.etextStr.index("End of Project Gutenberg Etext of As You Like It by Shakespeare") 55 exp = self.gut2.etextStr.index("End of Project Gutenberg Etext of As You Like It by Shakespeare") 56 self.assertEqual(out, exp) 56 self.assertEqual(out, exp) 57 57 58 def test_get_notes_end(self): 58 def test_get_notes_end(self): 59 out = self.gut1.get_notes_end() 59 out = self.gut1.get_notes_end() 60 exp = self.gut1.etextStr.index("As you Like it\n\nActus") 60 exp = self.gut1.etextStr.index("As you Like it\n\nActus") 61 self.assertEqual(out, exp) 61 self.assertEqual(out, exp) 62 62 63 def test_extract_text(self): 63 def test_extract_text(self): 64 # [[TODO: run this test on all of the etexts]] 64 # [[TODO: run this test on all of the etexts]] 65 for gut in [self.gut1, self.gut2]: 65 for gut in [self.gut1, self.gut2]: 66 out = gut.extract_text() 66 out = gut.extract_text() 67 notFound = (out.find('Gutenberg') == -1) 67 notFound = (out.find('Gutenberg') == -1) 68 self.failUnless(notFound) 68 self.failUnless(notFound) 69 69 trunk/src/shakespeare/utils.py
Revision 1 Revision 2 1 import os 1 import os 2 import urllib 2 import urllib 3 3 4 import conf 4 import conf 5 5 6 def get_local_path(remoteUrl, version=''): 6 def get_local_path(remoteUrl, version=''): 7 """Get local path to text of remote url. 7 """Get local path to text of remote url. 8 @type: string giving version of text (''|'cleaned') 8 @type: string giving version of text (''|'cleaned') 9 """ 9 """ 10 host,path = urllib.splithost(remoteUrl) 10 host,path = urllib.splithost(remoteUrl) 11 name = os.path.basename(path) 11 name = os.path.basename(path) 12 name = version + name 12 name = version + name 13 localPath = get_cache_path(name) 13 localPath = get_cache_path(name) 14 return localPath 14 return localPath 15 15 16 def download_url(url): 16 def download_url(url): 17 localPath = get_local_path(url) 17 localPath = get_local_path(url) 18 urllib.urlretrieve(url, localPath) 18 urllib.urlretrieve(url, localPath) 19 19 20 def get_cache_path(offset): 20 def get_cache_path(offset): 21 "Get full path of file in cache given by offset." 21 "Get full path of file in cache given by offset." 22 return os.path.join(conf.CACHEDIR, offset) 22 return os.path.join(conf.CACHEDIR, offset) 23 24 def download_gutenberg_index(): 25 "Download the Gutenberg Index file GUTINDEX.ALL." 26 utils.download_url(conf.GUTINDEX) 27 trunk/src/shakespeare/work.py
Revision 1 Revision 2 1 import os 1 import os 2 import urllib3 2 4 from utils import * 3 import utils 5 import conf 4 import conf 6 5 7 def download_gutenberg_index():8 "Download the Gutenberg Index file GUTINDEX.ALL."9 download_url(conf.GUTINDEX)10 6 11 def download_all_shakespeare(): 7 class GutenbergIndex(object): 12 """Download from Project Gutenberg all the shakespeare texts listed in 8 """Parse the index of Gutenberg works so as to find Shakespeare works. 13 the index. 14 """ 9 """ 15 index = make_index()10 16 for item in index:11 def make_url(self, year, idStr): 17 download_url(item[1])12 return 'http://www.gutenberg.org/dirs/etext%s/%s10.txt' % (year[2:], idStr) 18 13 19 def make_url(year, idStr): 14 def get_shakespeare_list(self): 20 return 'http://www.gutenberg.org/dirs/etext%s/%s10.txt' % (year[2:], idStr) 15 """Get list of shakespeare works and urls. 16 Results are sorted by work title. 17 """ 18 # results have format [ title, url, comments ] 19 # folio in comments indicates it is a first folio 20 results = [ ["Sonnets", 'http://www.gutenberg.org/dirs/etext97/wssnt10.txt', ''] ] 21 plays = self._extract_shakespeare_works() 22 for play in plays: 23 url = self.make_url(play[1], play[2]) 24 results.append([play[0], url, play[3]]) 25 def compare_list(item1, item2): 26 if item1[0] > item2[0]: return 1 27 else: return -1 28 results.sort(compare_list) 29 return results 30 31 def _extract_shakespeare_works(self): 32 """Get non-copyrighted Shakespeare works from Gutenberg 33 Results consist of folio and one other 'standard' version. 
34 @return: list consisting of tuples in form [title, year, id, comment] 35 """ 36 ff = file(utils.get_cache_path('GUTINDEX.ALL')) 37 results = [] 38 for line in ff.readlines(): 39 result = self.parse_line_for_folio(line) 40 if result: 41 results.append(result + ['folio']) 42 resultNormal = self.parse_line_for_normal(line) 43 if resultNormal: 44 results.append(resultNormal + ['']) 45 return results 46 47 def parse_line_for_normal(self, line): 48 "Parse GUTINDEX line for the 'normal' gutenberg shakespeare versions (i.e. not folio and out of copyright)." 49 if 'by William Shakespeare' in line and '[2' in line: 50 year = line[4:8] 51 tmp = line[9:] 52 endOfTitle = tmp.find(', by') 53 title = tmp[:endOfTitle] 54 startOfId = tmp.find('[2') 55 endOfId = tmp.find(']', startOfId) 56 idStr = tmp[startOfId+1:endOfId] 57 xstart = idStr.find('x') 58 idStr = idStr[:xstart] 59 return [title, year, idStr] &
