|
#! /usr/bin/env python
|
2 |
|
class Markov:
    """Markov chain over sliceable, hashable sequences (strings or tuples).

    Transitions are stored in ``self.trans``, a dict mapping a history
    (a slice of at most ``histsize`` trailing items) to the list of
    one-item slices observed after it.  Two special entries exist:

    * the key ``None`` lists the empty prefixes (``seq[:0]``) of every
      sequence fed in, which gives ``get()`` a starting value of the
      right type;
    * the value ``None`` in a transition list marks "end of sequence".
    """

    def __init__(self, histsize, choice):
        """Create an empty chain.

        histsize -- maximum number of trailing items used as history.
        choice   -- callable that picks one element from a non-empty list
                    (e.g. ``random.choice``).
        """
        self.histsize = histsize
        self.choice = choice
        self.trans = {}

    def add(self, state, next):
        """Record that `next` was observed immediately after `state`."""
        # setdefault replaces the has_key()/else dance (has_key is gone in
        # Python 3) and performs a single dictionary lookup.
        self.trans.setdefault(state, []).append(next)

    def put(self, seq):
        """Feed one training sequence into the chain."""
        n = self.histsize
        add = self.add
        # Remember an empty sequence of the same type as a starting point.
        add(None, seq[:0])
        for i in range(len(seq)):
            add(seq[max(0, i - n):i], seq[i:i + 1])
        # Clamp the start index: a plain ``len(seq) - n`` goes negative when
        # the sequence is shorter than the history size and would then slice
        # the wrong suffix, leaving get() without the end marker it looks up.
        add(seq[max(0, len(seq) - n):], None)

    def get(self):
        """Generate and return one sequence by walking the chain.

        Raises KeyError if nothing has been fed in with put() yet.
        """
        choice = self.choice
        trans = self.trans
        n = self.histsize
        seq = choice(trans[None])
        while True:
            history = seq[max(0, len(seq) - n):]
            step = choice(trans[history])
            if not step:
                # The end-of-sequence marker (None) was chosen.
                break
            seq = seq + step
        return seq
|
32 |
|
def test():
    """Command-line driver: train a Markov chain on text and print output.

    Options (taken from sys.argv):
      -0 .. -9  history size, one digit (default 2)
      -c        work on characters (default)
      -w        work on words
      -d        more debugging output (may be repeated)
      -q        no debugging output

    Each input file (default: stdin, also selected by '-') is split into
    paragraphs at blank lines; every paragraph is whitespace-normalised and
    fed to the chain.  Generated paragraphs are then printed forever,
    wrapped at 72 columns, until the process is interrupted.

    Output goes through single-argument print(...) calls so the text is
    identical whether print is a statement or a function.
    """
    import sys
    import random
    import getopt

    args = sys.argv[1:]
    try:
        # 'q' has to be listed, otherwise the documented -q option is
        # rejected as unknown.
        opts, args = getopt.getopt(args, '0123456789cdqw')
    except getopt.error:
        print('Usage: markov [-#] [-cddqw] [file] ...')
        print('Options:')
        print('-#: 1-digit history size (default 2)')
        print('-c: characters (default)')
        print('-w: words')
        print('-d: more debugging output')
        print('-q: no debugging output')
        print('Input files (default stdin) are split in paragraphs')
        print('separated blank lines and each paragraph is split')
        print('in words by whitespace, then reconcatenated with')
        print('exactly one space separating words.')
        print('Output consists of paragraphs separated by blank')
        print('lines, where lines are no longer than 72 characters.')
        # Stop here: continuing would hit the unbound name `opts`.
        sys.exit(2)

    histsize = 2
    do_words = 0
    debug = 1
    for o, a in opts:
        if '-0' <= o <= '-9':
            # int() instead of eval(): never evaluate command-line text.
            histsize = int(o[1:])
        if o == '-c':
            do_words = 0
        if o == '-d':
            debug = debug + 1
        if o == '-q':
            debug = 0
        if o == '-w':
            do_words = 1
    if not args:
        args = ['-']

    m = Markov(histsize, random.choice)
    try:
        for filename in args:
            if filename == '-':
                f = sys.stdin
                if f.isatty():
                    print('Sorry, need stdin from file')
                    continue
            else:
                f = open(filename, 'r')
            if debug:
                print('processing %s ...' % filename)
            try:
                text = f.read()
            finally:
                # Always release the file, but leave stdin open so that a
                # repeated '-' argument does not fail on a closed stream.
                if f is not sys.stdin:
                    f.close()
            for para in text.split('\n\n'):
                if debug > 1:
                    print('feeding ...')
                words = para.split()
                if words:
                    if do_words:
                        data = tuple(words)
                    else:
                        data = ' '.join(words)
                    m.put(data)
    except KeyboardInterrupt:
        print('Interrupted -- continue with data read so far')

    if not m.trans:
        print('No valid input files')
        return
    if debug:
        print('done.')

    if debug > 1:
        # Dump only the short-history entries (start of a paragraph).
        for key in m.trans.keys():
            if key is None or len(key) < histsize:
                print('%r %s' % (key, m.trans[key]))
        # In word mode the empty history is () rather than '', so guard the
        # lookup instead of raising KeyError.
        if histsize == 0 and '' in m.trans:
            print('%r %s' % ('', m.trans['']))
        print('')

    limit = 72
    while 1:
        data = m.get()
        if do_words:
            words = data
        else:
            words = data.split()
        line = []
        n = 0  # width of the current line including one trailing space
        for w in words:
            # Only wrap when the line already holds something; this avoids
            # a stray empty line in front of a single over-long word.
            if line and n + len(w) > limit:
                print(' '.join(line))
                line = []
                n = 0
            line.append(w)
            n = n + len(w) + 1
        print(' '.join(line))
        print('')
|
109 |
|
def tuple(list):
    """Return the items of `list` as a tuple.

    This module-level function shadows the built-in of the same name, so
    the bare name ``tuple`` inside this body would refer to this function
    itself.  ``type(())`` yields the real built-in tuple type without
    needing an import, and the conversion is delegated to it instead of
    being rebuilt by recursive halving and concatenation.  Any iterable is
    accepted, not just sized, sliceable sequences.
    """
    builtin_tuple = type(())
    return builtin_tuple(list)
|
115 |
|
# Run the command-line driver only when this file is executed as a script;
# importing the module must not start reading input or printing output.
if __name__ == "__main__":
    test()