"""Fetch a web page, strip its markup, and print the remaining plain words.

Run as a script, the page at PAGE_URL is downloaded, every
``<script>...</script>`` and ``<style>...</style>`` element and then every
remaining ``<...>`` tag is replaced by a single space, and the
whitespace-separated words that are left are printed.

The stripping helpers are plain string functions, so they can be imported and
used without touching the network.
"""
import urllib
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2, which the script was written for

PAGE_URL = "http://cse.kut.ac.kr/"


def _find_closer(text, tag, begin):
    """Return the index of the first ``</tag`` at or after *begin*, or -1.

    The tag name is compared case-insensitively.
    """
    tag_len = len(tag)
    pos = text.find("</", begin)
    while pos != -1:
        if text[pos + 2:pos + 2 + tag_len].lower() == tag:
            return pos
        pos = text.find("</", pos + 1)
    return -1


def strip_element(text, tag):
    """Replace every ``<tag ...>...</tag>`` span in *text* with one space.

    An opener is ``<`` immediately followed by *tag* (case-insensitive, prefix
    match); the span ends one character after the matching ``</tag``, that
    character being assumed to be the closing ``>``.  An opener that has no
    closer is left in place together with everything after it.

    Args:
        text: The markup to clean.
        tag: Element name, e.g. ``"script"``.

    Returns:
        The text with each matched element collapsed to a single space.
    """
    tag = tag.lower()
    tag_len = len(tag)
    pieces = []
    copied = 0  # start of the region not yet copied into ``pieces``
    scan = 0    # where to look for the next "<"
    while True:
        start = text.find("<", scan)
        if start == -1:
            break
        name_end = start + 1 + tag_len
        if text[start + 1:name_end].lower() != tag:
            scan = start + 1
            continue
        close = _find_closer(text, tag, name_end)
        if close == -1:
            # No closer from here on means none for any later opener either,
            # so the remainder is kept untouched.
            break
        pieces.append(text[copied:start])
        pieces.append(" ")
        # Skip "</" + tag + one more character (the expected ">").
        copied = scan = close + tag_len + 3
    pieces.append(text[copied:])
    return "".join(pieces)


def strip_tags(text):
    """Replace every ``<...>`` span in *text* with one space.

    A ``<`` that is never followed by ``>`` is left in place, along with the
    rest of the text.
    """
    pieces = []
    copied = 0
    while True:
        start = text.find("<", copied)
        if start == -1:
            break
        end = text.find(">", start + 1)
        if end == -1:
            break
        pieces.append(text[copied:start])
        pieces.append(" ")
        copied = end + 1
    pieces.append(text[copied:])
    return "".join(pieces)


def strip_markup(text):
    """Remove script elements, style elements, then all remaining tags.

    The order matters: script and style bodies are dropped as a whole first,
    so their contents never reach the generic tag pass.
    """
    text = strip_element(text, "script")
    text = strip_element(text, "style")
    return strip_tags(text)


def fetch_page(url):
    """Download *url* and return its body as a ``str``."""
    with closing(urlopen(url)) as response:
        raw = response.read()
    if not isinstance(raw, str):
        # Python 3 hands back bytes.  NOTE(review): UTF-8 is an assumption
        # about the page's encoding -- confirm against the site.
        raw = raw.decode("utf-8", "replace")
    return raw


def main():
    """Fetch PAGE_URL, strip its markup, and print the length and word report."""
    source = fetch_page(PAGE_URL)
    print("Initial length of source: " + str(len(source)))

    source = strip_markup(source)
    print("Last length of source: " + str(len(source)))

    words = source.split()
    print("Total num of plain words: " + str(len(words)))
    if words:
        # Every word is wrapped in double quotes and followed by a space,
        # all on a single output line.
        print(" ".join('"' + word + '" ' for word in words))


if __name__ == "__main__":
    main()