Difference between revisions of "Script lexique de mots"

From Mondothèque

(Created page with "#!/usr/bin/env/ python import string # remove punctuation def remove_punct(f): tokens = (' '.join(line.replace('\n', '') for line in f)).lower() for c in string.punctuatio...")
 
(No difference)

Latest revision as of 06:49, 28 October 2015

#!/usr/bin/env python3

import string

# remove punctuation

def remove_punct(f):
    """Return the text of *f* as one lower-cased string without punctuation.

    Args:
        f: An iterable of text lines, for example an open text file.

    Returns:
        A single string: every line with its newline characters removed,
        the lines joined by single spaces, the result lower-cased, and
        every character listed in ``string.punctuation`` deleted.
    """
    # str.replace needs an explicit replacement; the empty string deletes
    # the newline characters.
    tokens = " ".join(line.replace("\n", "") for line in f).lower()
    # One translate pass deletes all characters of string.punctuation
    # (ASCII punctuation only; other characters are left untouched).
    return tokens.translate(str.maketrans("", "", string.punctuation))

# add the words of the text to a set, a collection of unique items

def lexicon(tokens):
    """Add every word of *tokens* to the module-level ``wordset``.

    Args:
        tokens: A string of whitespace-separated words.

    Returns:
        The module-level ``wordset`` set, updated in place.
    """
    # split() without an argument splits on runs of whitespace, so
    # consecutive spaces never add an empty string to the lexicon.
    wordset.update(tokens.split())
    return wordset

# sort words alphabetically & write words to file

def publish(wordset):
    """Write the words of *wordset* to the module-level ``words`` file.

    The words are written in sorted order (default string ordering), one
    per line, each followed by a newline character.

    Args:
        wordset: An iterable of strings.
    """
    # can be reversed: sorted(wordset, reverse=True)
    alphalist = sorted(wordset)
    # The comment above sits on its own line so that it cannot swallow the
    # write step; writelines batches the output into a single call.
    words.writelines(word + "\n" for word in alphalist)

# define & open input/output files

# Opening both files in a single `with` statement closes them when the block
# ends, including when one of the steps below raises.
with open("1_notion.txt", "rt", encoding="utf-8") as f, \
        open("mots.txt", "wt", encoding="utf-8") as words:
    # lexicon() adds to this module-level set; publish() writes to `words`.
    wordset = set()

    # execute functions
    tokens = remove_punct(f)
    wordset = lexicon(tokens)
    publish(wordset)