Subversion Repositories Code-Repo

Rev

Blame | Last modification | View Log | RSS feed

import string, re

# Reference n-gram lists, written as whitespace-separated strings for easy
# editing and split into lists of upper-case strings.
# NOTE(review): presumably the most frequent English digrams/trigrams, most
# common first -- confirm the source. Not referenced in the visible code.
common_digram = (
    'TH HE IN ER AN RE ED ON ES ST '
    'EN AT TO NT HA ND OU EA NG AS '
    'OR TI IS ET IT AR TE SE HI OF'
).split()
common_trigram = 'THE ING AND HER ERE ENT THA NTH WAS ETH FOR DTH'.split()

def print_frequency(str, num_chars):
        """Print how often each 'num_chars'-long substring occurs in 'str'.

        Every window of 'num_chars' consecutive characters is counted, so
        overlapping occurrences count separately ("AAA" holds "AA" twice).

        One line is printed per distinct count, highest count first, in the
        form "<count> : <substring> <substring> ...". Substrings that share a
        count are listed in sorted order so the output is reproducible.

        Nothing is printed when 'str' is shorter than 'num_chars'.

        str       -- the text to analyse (only this argument is read; no
                     module-level state is used)
        num_chars -- length of the substrings to count (1 = letters,
                     2 = digrams, 3 = trigrams)
        """
        # Calculate the frequency of 'num_chars' characters in 'str'
        frequency = dict()
        for i in range(0, len(str) + 1 - num_chars):
                ngram = str[i:i + num_chars]
                frequency[ngram] = frequency.get(ngram, 0) + 1

        # Group the substrings by how often they occur
        by_count = dict()
        for ngram, count in frequency.items():
                by_count.setdefault(count, []).append(ngram)

        # Print out the frequency in decreasing order. A single-argument
        # print(...) produces the same output on Python 2 and Python 3.
        for count in sorted(by_count, reverse=True):
                entries = " ".join(sorted(by_count[count]))
                print("{0} : {1}".format(count, entries))

if __name__ == '__main__':
        # Hard-coded sample text to analyse. To analyse other text, assign it
        # here instead (for example, read it with raw_input("Enter string: ")).
        # Adjacent literals are concatenated, so no indentation whitespace
        # ends up inside the data.
        raw_text = ('XKJUROWMLLPXWZNPIMBVBQJCNOWXPCCHHVVFVSLLFVXHAZITYXOHULX'
                    'QOJAXELXZXMYJAQFSTSRULHHUCDSKBXKNJQIDALLPQSLLUHIAQFPBPC'
                    'IDSVCIHWHWEWTHBTXRLJNRSNCIHUVFFUXVOUKJLJSWMAQFVJWJSDYLJ'
                    'OGJXDBOXAJULTUCPZMPLIWMLUBZXVOODYBAFDSKXGQFADSHXNXEHSAR'
                    'UOJAQFPFKNDHSAAFVULLUWTAQFRUPWJRSZXGPFUTJQIYNRXNYNTWMHC')
        # Remove all whitespace from the input string
        input_stripped = re.sub(r'\s', '', raw_text)

        # Print out the sample size
        sample_size = len(input_stripped)
        print("Sample Size: {0}".format(sample_size))

        # Print out the count for each letter A-Z and its share of the sample
        # (a fraction, not a percentage, shown to two significant digits).
        # ascii_uppercase is used so the alphabet does not depend on locale.
        print("Letter Frequency Count:")
        for letter in string.ascii_uppercase:
                count = input_stripped.count(letter)
                # Avoid dividing by zero if the sample is empty
                share = float(count) / float(sample_size) if sample_size else 0.0
                print("{0} : {1:<2} : {2:.2}".format(letter, count, share))

        # Print out the sorted frequency for letters
        print("Letter Frequency Count (Sorted):")
        print_frequency(input_stripped, 1)

        # Print out the sorted frequency for digrams
        print("Digram Frequency Count:")
        print_frequency(input_stripped, 2)

        # Print out the sorted frequency for trigrams
        print("Trigram Frequency Count:")
        print_frequency(input_stripped, 3)