"""Generate pseudo-random text from a first-order Markov chain of words.

The transition table is built from a plain-text speech downloaded over HTTP.
Every word (and every punctuation mark, treated as a word of its own) maps to
the words that followed it in the text together with how often they did.
"""
from urllib.request import urlopen
from random import randint

# Text used to train the chain when this file is run as a script.
SPEECH_URL = "http://pythonscraping.com/files/inaugurationSpeech.txt"


def wordListSum(wordList):
    """Return the sum of all occurrence counts stored in *wordList*.

    Args:
        wordList: Mapping of word -> number of times it was observed.

    Returns:
        The total of all counts; 0 for an empty mapping.
    """
    return sum(wordList.values())


def retrieveRandomWord(wordList):
    """Pick one word from *wordList* at random, weighted by its count.

    Args:
        wordList: Mapping of word -> positive occurrence count.

    Returns:
        One of the keys of *wordList*; a key with count ``c`` is chosen with
        probability ``c / wordListSum(wordList)``.

    Raises:
        ValueError: If the counts do not add up to at least 1, in which case
            there is nothing to choose from.
    """
    total = wordListSum(wordList)
    if total < 1:
        raise ValueError("wordList must contain at least one positive count")
    # Draw a position in 1..total and walk the buckets until it is used up.
    randIndex = randint(1, total)
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word


def buildWordDict(text):
    """Build the word-transition table for *text*.

    Args:
        text: The raw training text.

    Returns:
        A two-level dict ``{word: {next_word: count}}`` where ``count`` is the
        number of times ``next_word`` directly followed ``word``. The final
        word of the text gets a key only if it also occurs earlier.
    """
    # Drop double quotes; they would otherwise stick to the adjacent words.
    text = text.replace("\"", "")

    # Surround each punctuation mark with spaces so that it becomes a token
    # of its own and takes part in the chain like any other word.
    punctuation = [',', '.', ';', ':']
    for symbol in punctuation:
        text = text.replace(symbol, " " + symbol + " ")

    # Split on any run of whitespace. Line breaks therefore separate words
    # instead of fusing the end of one line to the start of the next, and no
    # empty tokens are produced.
    words = text.split()

    wordDict = {}
    for previous, current in zip(words, words[1:]):
        followers = wordDict.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1
    return wordDict


def _generateChain(wordDict, startWord, length):
    """Walk *wordDict* from *startWord* and return up to *length* words.

    Generation stops early when the current word has no recorded successor,
    for example the last word of the training text. Every emitted word is
    followed by a single space.
    """
    words = []
    currentWord = startWord
    for _ in range(length):
        words.append(currentWord)
        followers = wordDict.get(currentWord)
        if not followers:
            # Dead end: nothing ever followed this word.
            break
        currentWord = retrieveRandomWord(followers)
    return "".join(word + " " for word in words)


def main():
    """Download the speech, build the chain and print 100 generated words."""
    # The context manager closes the HTTP response once the body is read.
    with urlopen(SPEECH_URL) as response:
        text = str(response.read(), 'utf-8')
    wordDict = buildWordDict(text)

    # Generate a Markov chain of length 100, starting from the word "I".
    print(_generateChain(wordDict, "I", 100))


if __name__ == "__main__":
    main()
参考:《Python 网络数据采集》
欢迎来到这里!
我们正在构建一个小众社区,大家在这里相互信任,以平等 • 自由 • 奔放的价值观进行分享交流。最终,希望大家能够找到与自己志同道合的伙伴,共同成长。
注册 关于