from urllib.request import urlopen
from random import randint
def wordListSum(wordList):
sum = 0
for word, value in wordList.items():
sum += value
return sum
def retrieveRandomWord(wordList):
randIndex = randint(1, wordListSum(wordList))
for word, value in wordList.items():
randIndex -= value
if randIndex <= 0:
return word
def buildWordDict(text):
# 剔除换行符和引号
text = text.replace("\n", "")
text = text.replace("\"", "")
# 保证每个标点符号都和前面的单词在一起
# 这样不会被剔除,保留在马尔可夫链中
punctuation = [',', '.', ';', ':']
for symbol in punctuation:
text = text.replace(symbol, " " + symbol + " ")
words = text.split(" ")
# 过滤空单词
words = [word for word in words if word != ""]
wordDict = {}
for i in range(1, len(words)):
if words[i-1] not in wordDict:
# 为单词新建一个词典
wordDict[words[i-1]] = {}
if words[i] not in wordDict[words[i-1]]:
wordDict[words[i-1]][words[i]] = 0
wordDict[words[i-1]][words[i]] = wordDict[words[i-1]][words[i]] + 1
return wordDict
text = str(urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8')
wordDict = buildWordDict(text)
# 生成链长为100的马尔可夫链
length = 100
chain = ""
currentWord = "I"
for i in range(0, length):
chain += currentWord + " "
currentWord = retrieveRandomWord(wordDict[currentWord])
print(chain)
参考:《Python 网络数据采集》
欢迎来到这里!
我们正在构建一个小众社区,大家在这里相互信任,以平等 • 自由 • 奔放的价值观进行分享交流。最终,希望大家能够找到与自己志同道合的伙伴,共同成长。
注册 关于