# Prints letter, digram and trigram frequency counts for a sample string.
import re
import string
from collections import defaultdict
|
# Common English digrams (two-letter sequences).
# NOTE(review): presumably ordered most-frequent first -- confirm against
# the table these were taken from.  Not referenced by the code visible here.
common_digram = [
    'TH', 'HE', 'IN', 'ER', 'AN', 'RE', 'ED', 'ON', 'ES', 'ST',
    'EN', 'AT', 'TO', 'NT', 'HA', 'ND', 'OU', 'EA', 'NG', 'AS',
    'OR', 'TI', 'IS', 'ET', 'IT', 'AR', 'TE', 'SE', 'HI', 'OF',
]

# Common English trigrams (three-letter sequences); same caveat as above.
common_trigram = [
    'THE', 'ING', 'AND', 'HER', 'ERE', 'ENT',
    'THA', 'NTH', 'WAS', 'ETH', 'FOR', 'DTH',
]
|
def print_frequency(str, num_chars):
    """Print how often each `num_chars`-long substring occurs in `str`.

    Every overlapping window of `num_chars` consecutive characters is
    counted.  The result is written to stdout, one line per distinct
    count, highest count first, formatted as::

        <count> : <gram> <gram> ...

    Grams sharing a count are listed in sorted order so the output is
    deterministic.  Nothing is printed when `str` is shorter than
    `num_chars` or when `num_chars` is less than 1.

    Args:
        str: Text to analyse.  (The name shadows the builtin; it is kept
            so callers passing it by keyword keep working.)
        num_chars: Window length: 1 for single letters, 2 for digrams,
            3 for trigrams, and so on.

    Returns:
        None.  The function only prints.
    """
    if num_chars < 1:
        # A window of non-positive length contains no grams.
        return

    # Count the windows of the argument itself.  The range is empty when
    # the text is shorter than one window, so no special case is needed.
    frequency = defaultdict(int)
    for start in range(len(str) - num_chars + 1):
        frequency[str[start:start + num_chars]] += 1

    # Invert the mapping so each distinct count yields one output line.
    grams_by_count = defaultdict(list)
    for gram, count in frequency.items():
        grams_by_count[count].append(gram)

    for count in sorted(grams_by_count, reverse=True):
        # A single-argument print() emits the same text on Python 2 and 3.
        print('%d : %s' % (count, ' '.join(sorted(grams_by_count[count]))))
|
if __name__ == '__main__':
    # Hard-coded sample text.  To analyse other text, replace this with a
    # value read from the user (raw_input() on Python 2).
    # Adjacent string literals are used instead of backslash continuation
    # so that source indentation never ends up inside the string.
    sample_text = (
        'XKJUROWMLLPXWZNPIMBVBQJCNOWXPCCHHVVFVSLLFVXHAZITYXOHULX '
        'QOJAXELXZXMYJAQFSTSRULHHUCDSKBXKNJQIDALLPQSLLUHIAQFPBPC '
        'IDSVCIHWHWEWTHBTXRLJNRSNCIHUVFFUXVOUKJLJSWMAQFVJWJSDYLJ '
        'OGJXDBOXAJULTUCPZMPLIWMLUBZXVOODYBAFDSKXGQFADSHXNXEHSAR '
        'UOJAQFPFKNDHSAAFVULLUWTAQFRUPWJRSZXGPFUTJQIYNRXNYNTWMHC'
    )

    # Remove all whitespace from the input string
    input_stripped = re.sub(r'\s', '', sample_text)

    # Print out the sample size
    sample_size = len(input_stripped)
    print("Sample Size: %d" % sample_size)

    # Print out the count and the fraction of the sample for each letter.
    # ascii_uppercase is always A-Z; string.uppercase depends on the locale.
    print("Letter Frequency Count:")
    for letter in string.ascii_uppercase:
        count = input_stripped.count(letter)
        # Guard the division so an empty sample prints 0.0 instead of crashing.
        fraction = float(count) / sample_size if sample_size else 0.0
        print("{0} : {1:<2} : {2:.2}".format(letter, count, fraction))

    # Print out the sorted frequency for letters
    print("Letter Frequency Count (Sorted):")
    print_frequency(input_stripped, 1)

    # Print out the sorted frequency for digrams
    print("Digram Frequency Count:")
    print_frequency(input_stripped, 2)

    # Print out the sorted frequency for trigrams
    print("Trigram Frequency Count:")
    print_frequency(input_stripped, 3)