# (Repository web-viewer navigation residue, not part of the program: Blame | Last modification | View Log | RSS feed)
import string, re
# Reference lists of two-letter (digram) and three-letter (trigram) sequences.
# NOTE(review): presumably the most frequent English n-grams, most common
# first -- confirm the source.  Neither list is referenced by the code visible
# in this file.
common_digram = ['TH','HE','IN','ER','AN','RE','ED','ON','ES','ST','EN','AT','TO','NT','HA','ND','OU','EA','NG','AS','OR','TI','IS','ET','IT','AR','TE','SE','HI','OF']
common_trigram = ['THE','ING','AND','HER','ERE','ENT','THA','NTH','WAS','ETH','FOR','DTH']
def print_frequency(str, num_chars):
    """Print how often each 'num_chars'-character substring occurs in 'str'.

    Every window of 'num_chars' consecutive characters is counted, so
    overlapping occurrences are included.  One line is printed per distinct
    count, highest count first, in the form

        <count> : <substring> <substring> ...

    where the substrings sharing that count are listed in sorted order.
    Nothing is printed when 'str' is shorter than 'num_chars'.

    Parameters:
        str       -- the text to analyse (the name shadows the builtin; it is
                     kept so existing keyword callers keep working).
        num_chars -- length of the substrings to count (1 = letters,
                     2 = digrams, 3 = trigrams, ...).

    Raises:
        ValueError -- if 'num_chars' is less than 1.
    """
    if num_chars < 1:
        raise ValueError("num_chars must be at least 1")

    # Work on the argument itself rather than on any module-level variable.
    text = str

    # Count each sliding window once; a single pass is enough.
    frequency = {}
    for start in range(len(text) - num_chars + 1):
        ngram = text[start:start + num_chars]
        frequency[ngram] = frequency.get(ngram, 0) + 1

    # Group the substrings by how often they occur.
    by_count = {}
    for ngram, count in frequency.items():
        by_count.setdefault(count, []).append(ngram)

    # Print the groups in decreasing order of count.  A single parenthesised
    # string argument prints identically under the Python 2 print statement
    # and the Python 3 print function.
    for count in sorted(by_count, reverse=True):
        print("%d : %s" % (count, " ".join(sorted(by_count[count]))))
if __name__ == '__main__':
    # Script entry point: report letter, digram and trigram frequencies for
    # the hard-coded sample text below.
    #
    # To analyse other text, read it interactively instead, e.g. with
    # raw_input("Enter string: ").
    sample_text = 'XKJUROWMLLPXWZNPIMBVBQJCNOWXPCCHHVVFVSLLFVXHAZITYXOHULX \
QOJAXELXZXMYJAQFSTSRULHHUCDSKBXKNJQIDALLPQSLLUHIAQFPBPC \
IDSVCIHWHWEWTHBTXRLJNRSNCIHUVFFUXVOUKJLJSWMAQFVJWJSDYLJ \
OGJXDBOXAJULTUCPZMPLIWMLUBZXVOODYBAFDSKXGQFADSHXNXEHSAR \
UOJAQFPFKNDHSAAFVULLUWTAQFRUPWJRSZXGPFUTJQIYNRXNYNTWMHC'

    # Remove all whitespace from the sample, including the spaces left in
    # the literal by the backslash line continuations above.
    input_stripped = re.sub(r'\s', '', sample_text)

    # Print out the sample size.  Each print below passes one pre-formatted
    # string so the output is the same under Python 2 and Python 3.
    sample_size = len(input_stripped)
    print("Sample Size: %d" % sample_size)

    # Print out, for each letter A-Z, its count and its relative frequency
    # (a fraction of the sample size, not a percentage).  ascii_uppercase is
    # used because it does not vary with the locale.
    print("Letter Frequency Count:")
    for letter in string.ascii_uppercase:
        count = input_stripped.count(letter)
        print("{0} : {1:<2} : {2:.2}".format(letter, count, float(count) / float(sample_size)))

    # Print out the sorted frequency for letters
    print("Letter Frequency Count (Sorted):")
    print_frequency(input_stripped, 1)

    # Print out the sorted frequency for digrams
    print("Digram Frequency Count:")
    print_frequency(input_stripped, 2)

    # Print out the sorted frequency for trigrams
    print("Trigram Frequency Count:")
    print_frequency(input_stripped, 3)