| 140 |
Kevin |
1 |
import string, re
|
|
|
2 |
|
|
|
3 |
# Reference list of two-letter sequences (digrams), kept in the order given
# by the original author; ten entries per row for readability.
common_digram = [
    'TH', 'HE', 'IN', 'ER', 'AN', 'RE', 'ED', 'ON', 'ES', 'ST',
    'EN', 'AT', 'TO', 'NT', 'HA', 'ND', 'OU', 'EA', 'NG', 'AS',
    'OR', 'TI', 'IS', 'ET', 'IT', 'AR', 'TE', 'SE', 'HI', 'OF',
]
|
|
|
4 |
# Reference list of three-letter sequences (trigrams), kept in the order
# given by the original author.
common_trigram = [
    'THE', 'ING', 'AND', 'HER', 'ERE', 'ENT',
    'THA', 'NTH', 'WAS', 'ETH', 'FOR', 'DTH',
]
|
|
|
5 |
|
|
|
6 |
def print_frequency(str, num_chars):
    """Print how often each 'num_chars'-long substring occurs in 'str'.

    Every window of 'num_chars' consecutive characters is counted, so
    overlapping occurrences each count once per starting position.  One line
    is printed per distinct count, highest count first, in the form
    "<count> : <substring> <substring> ...".  Substrings that share a count
    are listed in sorted order so the output is deterministic.

    Nothing is printed when 'str' is shorter than 'num_chars' or when
    'num_chars' is less than 1.

    str       -- the text to analyze (name kept for existing callers).
    num_chars -- length of the substrings to count (1 = letters,
                 2 = digrams, 3 = trigrams).
    """
    # Work on the argument itself, not on a module-level variable; the alias
    # also avoids using the builtin-shadowing parameter name in the body.
    text = str
    if num_chars < 1:
        return

    # Calculate the frequency of each 'num_chars'-character window.
    frequency = {}
    for start in range(len(text) - num_chars + 1):
        chunk = text[start:start + num_chars]
        frequency[chunk] = frequency.get(chunk, 0) + 1

    # Group the substrings by how many times they occur.
    by_count = {}
    for chunk, count in frequency.items():
        by_count.setdefault(count, []).append(chunk)

    # Print out the frequency in decreasing order.  A single-argument
    # print(...) behaves the same under Python 2 and Python 3.
    for count in sorted(by_count, reverse=True):
        print("{0} : {1}".format(count, " ".join(sorted(by_count[count]))))
|
|
|
27 |
|
|
|
28 |
if __name__ == '__main__':
    # Built-in sample text to analyze.  To analyze something else, assign a
    # different string to sample_text (for example one read from the user).
    # Each piece ends with a separating space; all whitespace is removed
    # below before any counting happens.
    sample_text = (
        'XKJUROWMLLPXWZNPIMBVBQJCNOWXPCCHHVVFVSLLFVXHAZITYXOHULX '
        'QOJAXELXZXMYJAQFSTSRULHHUCDSKBXKNJQIDALLPQSLLUHIAQFPBPC '
        'IDSVCIHWHWEWTHBTXRLJNRSNCIHUVFFUXVOUKJLJSWMAQFVJWJSDYLJ '
        'OGJXDBOXAJULTUCPZMPLIWMLUBZXVOODYBAFDSKXGQFADSHXNXEHSAR '
        'UOJAQFPFKNDHSAAFVULLUWTAQFRUPWJRSZXGPFUTJQIYNRXNYNTWMHC'
    )

    # Remove all whitespace from the input string.
    input_stripped = re.sub(r'\s', '', sample_text)

    # Print out the sample size.
    sample_size = len(input_stripped)
    print("Sample Size: {0}".format(sample_size))

    # Print out the letter frequency and the fraction of the sample for each
    # letter.  ascii_uppercase is locale-independent and is available on
    # both Python 2 and Python 3.
    print("Letter Frequency Count:")
    for letter in string.ascii_uppercase:
        count = input_stripped.count(letter)
        # Guard the division so an empty sample reports 0.0 instead of
        # raising ZeroDivisionError.
        fraction = float(count) / float(sample_size) if sample_size else 0.0
        print("{0} : {1:<2} : {2:.2}".format(letter, count, fraction))

    # Print out the sorted frequency for letters.
    print("Letter Frequency Count (Sorted):")
    print_frequency(input_stripped, 1)

    # Print out the sorted frequency for digrams.
    print("Digram Frequency Count:")
    print_frequency(input_stripped, 2)

    # Print out the sorted frequency for trigrams.
    print("Trigram Frequency Count:")
    print_frequency(input_stripped, 3)
|