from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup
# Download a .docx file, extract its main document XML from the ZIP
# container, and print the text of every <w:t> element. Text whose run is
# preceded by a properties element with style "Title" gets an extra print
# before it.

# Read the whole payload, then release the HTTP connection right away.
with urlopen("http://pythonscraping.com/pages/AWordDocument.docx") as response:
    # ZipFile needs a seekable file object, so buffer the bytes in memory.
    wordFile = BytesIO(response.read())

# The document body is stored in the archive member word/document.xml.
with ZipFile(wordFile) as document:
    xml_content = document.read('word/document.xml')

# NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
# installed. The lowercase "w:pstyle" lookup below presumably relies on an
# HTML parser lower-casing tag names -- confirm before pinning a parser.
wordObj = BeautifulSoup(xml_content.decode('utf-8'))
textStrings = wordObj.find_all("w:t")

for textElem in textStrings:
    closeTag = ""
    try:
        # Look in the element just before this text's parent for a style tag.
        style = textElem.parent.previous_sibling.find("w:pstyle")
        # .get() instead of [...] so a style tag that lacks a w:val
        # attribute is skipped rather than raising an uncaught KeyError.
        if style is not None and style.get("w:val") == "Title":
            # NOTE(review): both strings here are empty, so a Title match only
            # adds one blank line before the text. This looks like markup that
            # was lost from the literals -- confirm the intended output.
            print("")
            closeTag = ""
    except AttributeError:
        # No preceding sibling to inspect: there is no tag to print.
        pass
    print(textElem.text)
    print(closeTag)
# Reference: "Web Scraping with Python" (Chinese edition: 《Python网络数据采集》)
import docx
# Walk through basic python-docx usage on test.docx: inspect the first
# paragraph and its first run, append new content, and save the file back.

# Open the Word document.
doc = docx.Document('test.docx')

# Report how many paragraphs the document holds.
all_paragraphs = doc.paragraphs
print(len(all_paragraphs))

# Show the text and the style of the first paragraph.
first_paragraph = all_paragraphs[0]
print(first_paragraph.text)
print(first_paragraph.style)

# Append a new paragraph to the document.
doc.add_paragraph('Hello')

# Inspect the runs of the first paragraph: count, then the first run's
# text and style.
first_runs = first_paragraph.runs
print(len(first_runs))
first_run = first_runs[0]
print(first_run.text)
print(first_run.style)

# Underline the first run.
first_run.underline = True

# Append another paragraph and attach an extra run to it.
appended_paragraph = doc.add_paragraph('Word')
appended_paragraph.add_run(' hahaha')

# Write the changes back to the same file.
doc.save('test.docx')
欢迎来到这里!
我们正在构建一个小众社区,大家在这里相互信任,以平等 • 自由 • 奔放的价值观进行分享交流。最终,希望大家能够找到与自己志同道合的伙伴,共同成长。
注册 关于