2009 109

From techdaze

Jump to: navigation, search

Looked at some examples:

Contents

In-class, doing Exercise 12.3

Write a function called most_frequent that takes a string and prints the letters in decreasing order of frequency. Find text samples from several different languages and see how letter frequency varies between languages. Compare your results with the tables at http://wikipedia.org/wiki/Letter_frequencies.

Recall the histogram function from Chapter 9, p. 105:

def histogram(s):
    """Count how often each element of ``s`` occurs.

    ``s`` is iterated once (for a string, character by character).
    Returns a dict mapping every distinct element to its number of
    occurrences; an empty ``s`` gives an empty dict.
    """
    counts = {}
    for item in s:
        # get() supplies 0 the first time an element is seen.
        counts[item] = counts.get(item, 0) + 1
    return counts

Python Image Library


farah: search and replace text

# Censor selected words in house_arrest.html and print the result.
import re

# Read the whole page; the with-block closes the file handle even if
# reading fails (the original never closed it).
with open("house_arrest.html") as source:
    text = source.read()

# Earlier draft, kept for reference: a dict of literal strings.
# replacements = {}
# replacements["India"] =  "*****";
# replacements["Azhar"] =  "$$$$$$$";

# (compiled pattern, replacement text) pairs, applied in list order.
# re.I makes both patterns case-insensitive; the \b anchors restrict the
# first one to whole-word matches, while "azhar" also matches inside
# longer words.
replacements = [
    (re.compile(r"\bindia\b", re.I), "******"),
    (re.compile(r"azhar", re.I), "$$$$"),
]

for pattern, replacement in replacements:
    text = pattern.sub(replacement, text)

# The parenthesised form prints the same output under Python 2 and 3.
print(text)


Stéphanie: PIL visualization of word frequency in a text

# Load the text to analyse. The with-block closes the file automatically
# and the handle no longer shadows the built-in name `file`.
with open("textfile.txt", "r") as textfile:
    text = textfile.read()

# Lower-case the text and split it on whitespace. Punctuation is NOT
# removed here, so "word" and "word," end up as two different entries.
words = text.lower().strip().split()
 
def wordFrequency(word_list=None):
    """Return the distinct words ordered from most to least frequent.

    word_list: optional iterable of words to count. When omitted (or
        None), the module-level ``words`` list is used, which is what
        the original zero-argument call did.

    Returns a list containing each distinct word once. Words with the
    same count appear in descending string order, because the
    (count, word) pairs are sorted as a whole with reverse=True.
    """
    if word_list is None:
        word_list = words

    counts = {}
    for word in word_list:
        counts[word] = counts.get(word, 0) + 1

    # Put the count first in each pair so that it is the primary sort key.
    ranked = sorted(
        ((count, word) for word, count in counts.items()),
        reverse=True,
    )

    # Drop the occurrence numbers, keeping only the ordered words.
    return [word for count, word in ranked]
 
# Words of the text, most frequent first.
wf = wordFrequency()
# The parenthesised form prints the same output under Python 2 and 3.
print(wf)


#PIL visualization
from PIL import Image, ImageDraw, ImageFont

# 800x600 RGB canvas, painted white by the full-size rectangle below.
im = Image.new("RGB", (800, 600))
draw = ImageDraw.Draw(im)
font = ImageFont.truetype("ArnheBlo.ttf", 24)
draw.rectangle((0, 0) + im.size, fill="#FFF")

# Draw the words along a diagonal, each one 10 pixels further right and
# down than the previous one. enumerate() yields the same position that
# wf.index(word) did (every word occurs once in wf) without rescanning
# the list for each word. The bare `draw.textsize` expression was
# dropped: it never called the method, so it had no effect.
for rank, word in enumerate(wf):
    offset = rank * 10
    draw.text((offset, offset), word, fill="#000", font=font)

del draw
im.save("test.png", "PNG")


Alex: HTML cloud

Use it like this:

cat myfile.txt | python thescript.py


Below: thescript.py

#! /usr/bin/env python
 
# reading what is piped in:
import sys
text = sys.stdin.read()
# Lower-case the input and split it on whitespace into a list of words.
words = text.lower().strip().split()

#stop words file: see http://en.wikipedia.org/wiki/Stop_words
# the file is downloadable at http://www.dcs.gla.ac.uk/idom/ir_resources/linguistic_utils/stop_words
# The with-block closes the file even if reading fails.
with open('stop_words.txt', 'r') as f:
    stopwordslist = f.read()

# A set makes each later "is this a stop word?" test a constant-time
# lookup instead of a scan through a list.
stopwords = set(stopwordslist.split())
 
# Tally how many times each word occurs in the input.
dico = {}

for w in words:
    # get() supplies 0 the first time a word is seen.
    dico[w] = dico.get(w, 0) + 1
 
#How many times does the most common word appear? -> not needed anymore
#max = 0
#
#for i in dico:
#    if dico[i] > max:
#        max = dico[i]
 
#Declaring HTML header and styles
page = """
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 
    <html xmlns="http://www.w3.org/1999/xhtml">
        <head>
            <title>Conforming XHTML 1.0 Strict Template</title>
            <style>
                body {
                    text-align: center;
                    margin: 0;
                }
                p {
                    text-transform: uppercase;
                    margin:0;
                    line-height:1em;
                    border: 1px solid white;
                    color: white;
                    background-color: black;
                }
                p:hover {
                    position: relative;
                    background-color: red;
                    cursor: default;
                }
            </style>
        </head>
 
        <body>
 
"""
 
# Debug output: dump the raw word counts to stdout. The parenthesised
# form prints the same output under Python 2 and 3.
print(dico)

# Earlier approach, superseded by the sort below: walk the counts down
# from the maximum and add the matching words to the page.
#while max > 0:
#    for j in dico:
#        if dico[j] == max:
#            if j not in stopwords:
#                page += "<p style=\"font-size:%spx;float:left\">%s</p> " % (max*max*2, j)
#    max -= 1

# Available in both Python 2 and 3; escapes &, < and >.
from xml.sax.saxutils import escape

# (count, word) pairs with the count first so that it is the primary
# sort key; reverse=True puts the most frequent words first.
t = sorted(((count, word) for word, count in dico.items()), reverse=True)

# One floating <p> per word, with a font size of 2 * count^2 pixels.
paragraphs = []
for count, word in t:
    # Skip entries that are in the stopwords list.
    if word in stopwords:
        continue
    # Escape the word so that input containing &, < or > cannot break
    # the generated markup.
    paragraphs.append(
        '<p style="font-size:%spx;float:left">%s</p> '
        % (count * count * 2, escape(word))
    )

# Join once instead of growing `page` with += for every word.
page += "".join(paragraphs)
 
# Close the <body> and <html> elements opened in the page header.
page += """
        </body>
    </html>
"""

# Write the finished page to disk; the with-block closes the file even
# if the write fails.
with open('cloud_03.html', 'w') as html:
    html.write(page)
Personal tools