"""WordCatDSType01.py (Word frequency and length analysis program) Copyright 2004, Toshikazu Ikuta , Indiana University This code is written and distributed under the GNU General Public License which means that its source code is freely-distributed and available to the general public. See http://www.gnu.org/copyleft/gpl.html for details on the license or the the file gpl.txt that should always be distributed with this code. """ """ INPUT Input text should be line by line """ import string, sys, os.path, re, glob dictionary = "kf.wds" dictio = open(dictionary, 'r') fc = re.compile("^or$|^and$|^of$|^for$|^the$|^on$|^or$|^with$|^in$|^to$|^a$") wordlist = [] typelist = {} dict=[] counter = 0 word ='' listt = {} wlist = [] for line in dictio.readlines(): dict.append([]) dict[counter] = string.split(line) counter +=1 for i in range(len(dict)): dict[i][0] = string.lower(dict[i][0]) #print dict def cat(filename): #wordlist = [] #filename = raw_input('Enter file name: ') #filename = "Cont AndK.txt" infile = open(filename, 'r') outfile = open("Re-TypeFreq-"+filename[21:-4] + ".csv", "w+") #outfile.write(filename) #print "\n\n\n\n\n" #print filename #print "\n\n\n\n" global wordlist wordlist = [] templist = [] count=0 #category index counte =0 #word index in a category for line in infile.readlines(): templist = string.split(line) for i in range(len(templist)): if typelist.has_key(templist[i]): typelist[templist[i]] += 1 else: typelist[templist[i]] = 1 for k in typelist: wordlist.append([k,typelist[k]]) for i in range(len(wordlist)): #for j in range(len(wordlist[i])): # if j > 1: #MODIFIED #for h in range(len(wordlist[i][j])-1): # wordlist[i][j][h+1].append(1) wordlist[i].append(0) for k in range(len(dict)): if len(dict[k])>1: if dict[k][1] == wordlist[i][0]: wordlist[i][2] = dict[k][0] if wordlist[i][1] == 1: """ if the word is not in the dictionary it can be a compound word The following lines are for compound words """ posit = 0 llist = [] cmp = 0 for s in range(len(wordlist[i][0])): """try to find - , then take it as a boundary""" if wordlist[i][0][s] == "-": cmp = 1 tempW = wordlist[i][0][posit:s] posit = s+1 if fc.match(tempW): continue else: llist.append(tempW) if cmp == 1: """ if - is previously found i.e. if it's a compound """ if not fc.match(wordlist[i][0][posit:]): llist.append(wordlist[i][posit:]) frq = 0.0 for t in range(len(llist)): for b in range(len(dict)): if len(dict[b])>1: if dict[b][1] == llist[t]: frq += int(dict[b][0]) wordlist[i][2] = float(frq/len(llist)) """ for i in range(len(wordlist)): for j in range (len(wordlist[i])): if j > 1: #MODIFIED tempf = 0.0 cnt = 0 for h in range(len(wordlist[i][j])): if h != 0: #print wordlist[i][j][h][1] tempf += int(wordlist[i][j][h][1]) cnt += 1 #print wordlist[i][j] if cnt > 0: wordlist[i][j][0][1] = float(tempf/cnt)#(len(wordlist[i][j])-2)) #else: #wordlist[i][j][0][1] = 0.0 """ """OUTPUT PART""" for i in range(len(wordlist)): for j in range (len(wordlist[i])): if j == 0: outfile.write("\n"+wordlist[i][j]) else: outfile.write("," + str(wordlist[i][j])) """ else: for h in range(len(wordlist[i][j])): if h == 0: outfile.write("\n,,"+ str(wordlist[i][j][h][0])) #The RESPONCE PERIOD is printed rt = wordlist[i][j][h][0] outfile.write(","+ str(wordlist[i][j][h][1])) #The AVERAGE FREQ is printed else: #print "\n\n", wordlist[i][j][h][0] outfile.write("\n,,,,"+ wordlist[i][j][h][0]) #The WORD is printed wd = wordlist[i][j][h][0] #print wd outfile.write(","+ str(wordlist[i][j][h][1])) #The FREQ is printed freq = wordlist[i][j][h][1] if listt.has_key(wd): listt[wd]+= (4-rt) else: listt[wd]= (4-rt) """ infile.close() outfile.close() if __name__ == '__main__': for i in sys.argv[1:]: fspec = os.path.normcase(i) lfiles = glob.glob(fspec) for x in lfiles: cat(x) name = x #ga()