2009 109
From techdaze
Looked at some examples:
Contents |
In-class, doing Exercise 12.3
Write a function called most_frequent that takes a string and prints the letters in decreasing order of frequency. Find text samples from several different languages and see how letter frequency varies between languages. Compare your results with the tables at http://wikipedia.org/wiki/Letter_frequencies.
Recall the histogram function from Chapter 9, p. 105:
def histogram(s):
    """Count how many times each item occurs in ``s``.

    ``s`` is iterated once; for a string this means one entry per
    distinct character.

    Returns a plain dict mapping each item to its number of occurrences.
    An empty input yields an empty dict.
    """
    d = dict()
    for c in s:
        # dict.get supplies 0 for an item seen for the first time, so a
        # single expression covers both the "new" and "seen" cases.
        d[c] = d.get(c, 0) + 1
    return d
farah: search and replace text
# Search and replace: mask selected words in an HTML file and print the result.
import re

# (compiled pattern, replacement) pairs, applied in the order listed.
# Compiled patterns are used instead of literal strings so that matching
# can be case-insensitive (re.I) and, for "india", limited to whole words
# (\b word boundaries).
replacements = [
    (re.compile(r"\bindia\b", re.I), "******"),
    (re.compile(r"azhar", re.I), "$$$$"),
]

# The with-block closes the file as soon as its contents have been read.
with open("house_arrest.html") as source:
    text = source.read()

for pattern, replacement in replacements:
    text = pattern.sub(replacement, text)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(text)
Stéphanie: PIL visualization of word frequency in a text
def wordFrequency(word_list=None):
    """Return the distinct words ordered from most to least frequent.

    word_list -- iterable of words to count. When omitted, the
                 module-level ``words`` list (built from textfile.txt when
                 this file is run as a script) is used; a NameError is
                 raised if that list has not been defined.

    Words with the same count are ordered in reverse alphabetical order,
    because the (count, word) pairs are sorted in descending order.
    """
    if word_list is None:
        word_list = words

    counts = {}
    for word in word_list:
        counts[word] = counts.get(word, 0) + 1

    # Pair each word with its count, count first, so that sorting orders
    # by number of occurrences before falling back to the word itself.
    ranked = sorted(((count, word) for word, count in counts.items()),
                    reverse=True)

    # Drop the counts; only the ordering is needed by the caller.
    return [word for count, word in ranked]


if __name__ == "__main__":
    with open("textfile.txt", "r") as text_file:
        text = text_file.read()

    # Lower-case the text and split it on whitespace. Note that strip()
    # only trims surrounding whitespace: punctuation is NOT removed, so
    # "word" and "word," are counted as different words.
    words = text.lower().strip().split()

    wf = wordFrequency()
    print(wf)

    # PIL visualization
    from PIL import Image, ImageDraw, ImageFont

    im = Image.new("RGB", (800, 600))
    draw = ImageDraw.Draw(im)
    font = ImageFont.truetype("ArnheBlo.ttf", 24)

    # White background covering the whole image.
    draw.rectangle((0, 0) + im.size, fill="#FFF")

    # Each word is drawn 10px further right and down than the previous
    # one, so the most frequent word ends up in the top-left corner.
    for position, word in enumerate(wf):
        offset = position * 10
        draw.text((offset, offset), word, fill="#000", font=font)

    del draw
    im.save("test.png", "PNG")
Alex: HTML cloud
Use it like this:
cat myfile.txt | python thescript.py
Below: thescript.py
#! /usr/bin/env python
"""Build an HTML word cloud from text piped in on standard input.

Usage:
    cat myfile.txt | python thescript.py

The words of the input are counted, words listed in stop_words.txt are
left out, and the remaining words are written to cloud_03.html with a
font size that grows with the square of their frequency.
"""
import sys
from xml.sax.saxutils import escape

# Stop words file: see http://en.wikipedia.org/wiki/Stop_words
# The file is downloadable at
# http://www.dcs.gla.ac.uk/idom/ir_resources/linguistic_utils/stop_words
STOP_WORDS_PATH = 'stop_words.txt'
OUTPUT_PATH = 'cloud_03.html'

# HTML header and styles.
PAGE_HEADER = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Conforming XHTML 1.0 Strict Template</title>
<style>
body {
text-align: center;
margin: 0;
}
p {
text-transform: uppercase;
margin:0;
line-height:1em;
border: 1px solid white;
color: white;
background-color: black;
}
p:hover {
position: relative;
background-color: red;
cursor: default;
}
</style>
</head>
<body>
"""

PAGE_FOOTER = """
</body>
</html>
"""


def count_words(words):
    """Return a dict mapping each word in ``words`` to its number of occurrences."""
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return counts


def build_page(counts, stopwords):
    """Return the complete HTML page for the word cloud.

    counts    -- dict mapping word -> number of occurrences
    stopwords -- container of words to leave out of the cloud

    Words appear from most to least frequent; each one is rendered as a
    <p> element whose font size in pixels is 2 * count * count.
    """
    # (count, word) pairs, count first, so that the sort orders by the
    # number of occurrences before falling back to the word itself.
    ranked = sorted(((count, word) for word, count in counts.items()),
                    reverse=True)

    parts = [PAGE_HEADER]
    for count, word in ranked:
        # Skip entries that are in the stop words list.
        if word in stopwords:
            continue
        # The words come straight from the input text, so escape &, < and >
        # to keep stray markup in the input from breaking the page.
        parts.append("<p style=\"font-size:%spx;float:left\">%s</p> "
                     % (count * count * 2, escape(word)))
    parts.append(PAGE_FOOTER)

    # Joining once avoids repeatedly copying a growing string.
    return "".join(parts)


def main():
    """Read text from stdin and write the word cloud to cloud_03.html."""
    # Read what is piped in and turn it into a list of lower-cased words.
    text = sys.stdin.read()
    words = text.lower().strip().split()

    # A set makes the per-word stop-word check a constant-time lookup.
    with open(STOP_WORDS_PATH, 'r') as stop_file:
        stopwords = set(stop_file.read().split())

    dico = count_words(words)
    # Echo the raw counts to stdout, as the original script did.
    print(dico)

    page = build_page(dico, stopwords)

    with open(OUTPUT_PATH, 'w') as html:
        html.write(page)


if __name__ == "__main__":
    main()

