Python中文自然语言处理

NLP的理解

NLP的全称为Natural Language Processing，也就是自然语言处理，可用于机器翻译、信息检索、文本分类等应用，先前的聊天情感分析也属于NLP的范畴。

Anaconda

Anaconda为一个开源的python发行版本，内含各种代码库和数学库。以下代码使用Spyder编写。

虚拟环境的创建

虚拟环境的创建以及后续各个库的安装，均在Anaconda Prompt中完成。

第一步：根据下载的python版本创建虚拟环境

本篇默认使用3.12.6的python版本，可根据自己的电脑python版本进行替换

在Anaconda Prompt中输入以下命令查看现在电脑的python版本：

python --version

返回Python 3.12.6，继续输入以下命令：

conda create -n NLP python=3.12.6 # 这里的python后接查看的版本数字

第二章

以下为肖刚主编的Python中文自然语言处理教程的实践第二章操作，但是说实话这个上面的代码有问题的部分比较多，可能是因为编写的环境和库的迭代导致的。

nltk使用代码

# In[1] Load the NLTK book texts; the wildcard import provides text1 and FreqDist.
from nltk.book import *
# In[2] Words that appear in contexts similar to "pretty".
text1.similar('pretty')
# In[3] Show every occurrence of "danger" together with its surrounding text.
text1.concordance(word='danger')
# In[4] Length of the text in tokens (words and punctuation marks).
token_count = len(text1)
token_count
# In[5] Word pairs that frequently occur together.
text1.collocations()
# In[6] Contexts shared by the two given words.
text1.common_contexts(['monstrous','very'])
# In[7] Vocabulary of the text: the set of distinct tokens.
vocabulary = set(text1)
vocabulary
# In[8] Vocabulary statistics.
sorted(vocabulary)  # vocabulary in sorted order
vocab_size = len(vocabulary)
vocab_size  # number of distinct tokens
token_count / vocab_size  # average number of times each token is used
# In[9] Frequency distribution of the tokens.
fdist = FreqDist(text1)
fdist['ship']  # number of occurrences of "ship"
voc = fdist.most_common(10)  # the ten most frequent tokens
# In[10] The same statistics as In[4] and In[8], repeated as in the original notes.
token_count
sorted(vocabulary)
vocab_size
token_count / vocab_size

词汇分布情况

# NOTE(review): these cells repeat the "nltk使用代码" section line for line; the
# heading suggests a different example was intended here -- confirm with the book.
# In[1] Load the NLTK book texts; the wildcard import provides text1 and FreqDist.
from nltk.book import *
# In[2] Words that appear in contexts similar to "pretty".
text1.similar('pretty')
# In[3] Show every occurrence of "danger" together with its surrounding text.
text1.concordance(word='danger')
# In[4] Length of the text in tokens (words and punctuation marks).
token_count = len(text1)
token_count
# In[5] Word pairs that frequently occur together.
text1.collocations()
# In[6] Contexts shared by the two given words.
text1.common_contexts(['monstrous','very'])
# In[7] Vocabulary of the text: the set of distinct tokens.
vocabulary = set(text1)
vocabulary
# In[8] Vocabulary statistics.
sorted(vocabulary)  # vocabulary in sorted order
vocab_size = len(vocabulary)
vocab_size  # number of distinct tokens
token_count / vocab_size  # average number of times each token is used
# In[9] Frequency distribution of the tokens.
fdist = FreqDist(text1)
fdist['ship']  # number of occurrences of "ship"
voc = fdist.most_common(10)  # the ten most frequent tokens
# In[10] The same statistics as In[4] and In[8], repeated as in the original notes.
token_count
sorted(vocabulary)
vocab_size
token_count / vocab_size

累积频率分布图

# Plot the running total of occurrences of the ten most frequent tokens of text1.
from itertools import accumulate

import matplotlib.pyplot as plt
from nltk.book import *

plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels
# Commonly set together with a CJK font so that minus signs still render.
plt.rcParams["axes.unicode_minus"] = False

plt.grid()  # draw the grid
fdist = FreqDist(text1)  # token counts (this line is missing from the book's listing)
# (token, count) pairs ordered from the most to the least frequent token.
fdist1 = sorted(fdist.items(), key=lambda x: x[1], reverse=True)

top_entries = fdist1[:10]
x = [token for token, _ in top_entries]  # the tokens, most frequent first
# Running total of the counts: y[i] is the sum of the counts of x[0] .. x[i].
y = list(accumulate(count for _, count in top_entries))

# Draw the figure.
plt.plot(x, y)
plt.title('常用词累计频率图')
plt.ylabel('累计频率')
plt.xlabel('常用词')
plt.show()

古腾堡语料库

import nltk
nltk.corpus.gutenberg.fileids()  # file ids available in the Gutenberg corpus
# In[1] Open one text and count its tokens
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
print(emma)
len(emma)
# In[2] Concordance search inside the text
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance('surprise')
# In[3] Per-file statistics for every text of the Gutenberg corpus
from nltk.corpus import gutenberg
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))    # length of the raw text
    num_words = len(gutenberg.words(fileid))  # number of tokens
    num_sents = len(gutenberg.sents(fileid))  # number of sentences
    # Case-folded vocabulary size; computed but not part of the printed line.
    num_vocab = len({w.lower() for w in gutenberg.words(fileid)})
    print(f'{num_chars} {num_words} {num_sents} {fileid}')
# In[4] Raw content of every web text file
from nltk.corpus import webtext
for fileid in webtext.fileids():
    entry = (fileid, webtext.raw(fileid))
    print(entry)
# In[5] Size of every web text file: characters, tokens, sentences
for fileid in webtext.fileids():
    char_total = len(webtext.raw(fileid))
    word_total = len(webtext.words(fileid))
    sent_total = len(webtext.sents(fileid))
    print(fileid, char_total, word_total, sent_total)
# In[6] Posts from the instant-messaging chat corpus
from nltk.corpus import nps_chat
chatroom1 = nps_chat.posts('11-08-adults_705posts.xml')
chatroom2 = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom1[135])
print(chatroom2[123])

布朗语料库

# `nltk` is imported explicitly: the cells below call nltk.FreqDist, and the
# wildcard import from nltk.book is not relied upon to bind the name `nltk`.
import nltk
from nltk.corpus import brown
from nltk.book import *
# In[1] Inspect the corpus
print(brown.categories())  # available categories
print(brown.words(categories='news'))  # tokens of the "news" category
print(brown.words(fileids=['cb15']))  # tokens of file cb15
print(brown.sents(categories=['news','editorial','reviews']))  # sentences of three categories
# In[2] Count how often each modal verb occurs in the "news" category
news_texts = brown.words(categories='news')
fdist = nltk.FreqDist(w.lower() for w in news_texts)  # counts are case-insensitive
modals = ['can','could','may','might','must','will']
for m in modals:
    print(f'{m}: {fdist[m]}')

路透社语料库

from nltk.corpus import reuters  # Reuters corpus reader
# In[1] Categories and document ids
categories = reuters.categories()  # all category names
len(categories)
fileids = reuters.fileids()  # all document ids
len(fileids)
print(fileids[:5])  # the first five document ids
# In[2] Categories assigned to one document
reuters.categories('test/14832')
# In[3] Documents that belong to the given categories
print(reuters.fileids(['barley','corn']))

就职演讲语料库

import matplotlib.pyplot as plt
import nltk
from nltk.corpus import inaugural  # inaugural address corpus reader
# In[1] Inspect the corpus
len(inaugural.fileids())  # number of texts
inaugural.fileids()
# In[2] The first four characters of every file id are the year of the speech
print([fileid[:4] for fileid in inaugural.fileids()])
# In[3] Same information, one year per line
print(inaugural.fileids())
for fileid in inaugural.fileids():
    print(fileid[:4])
# In[4] Usage over time of words starting with "citizen" and "free"
tt = ['citizen','free']
# Condition = target prefix, sample = year; one event per matching token.
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in tt
    if w.lower().startswith(target)
)
plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels
plt.grid()
style = ['--','-']
for j, k in zip(tt, style):
    # (year, count) pairs in chronological order.
    by_year = sorted(cfd[j].items(), key=lambda item: item[0])
    # int() replaces the original eval(): same result for digit strings,
    # without executing arbitrary text as code.
    x = [int(year) for year, _ in by_year]
    y = [count for _, count in by_year]
    plt.plot(x, y, linestyle=k, label=j)
# Axis labels and legend are set once, after all lines have been drawn.
plt.xlabel('年份')
plt.ylabel('计数')
plt.legend()
plt.show()

查找书籍

# =============================================================================
# Note: langconv has to be installed separately; in the Anaconda Prompt run
# ## pip install langconv ##
# Depending on your network, a proxy/VPN may be needed for the downloads.
# The website has changed since the book was written, so the selected
# excerpts differ from the ones shown in the book.
# =============================================================================
# In[0]
from urllib.request import urlopen
# In[0] Set up the language converter, see:
### https://pypi.org/project/langconv/ ###
from langconv.converter import LanguageConverter
from langconv.language.zh import zh_cn
# NOTE(review): presumably converts Traditional to Simplified Chinese -- confirm
# against the langconv documentation.
lc_cn = LanguageConverter.from_language(zh_cn)
# In[1] Excerpt of "Journey to the West"
# The response is used as a context manager so the connection is always closed.
with urlopen(url='https://www.gutenberg.org/cache/epub/23962/pg23962.txt') as response:
    html1 = response.read().decode('utf-8')
lc_cn.convert(html1[2200:2384])
# In[2] Excerpt of "Dream of the Red Chamber"
with urlopen(url='https://www.gutenberg.org/cache/epub/24264/pg24264.txt') as response:
    html1 = response.read().decode('utf-8')
lc_cn.convert(html1[2220:2670])

获取网络在线语料库文本


# In[0]
from urllib.request import urlopen
# In[0] Set up the language converter, see:
### https://pypi.org/project/langconv/ ###
from langconv.converter import LanguageConverter
from langconv.language.zh import zh_cn
lc_cn = LanguageConverter.from_language(zh_cn)
# In[1] Text labelled "Records of the Three Kingdoms" in the original notes
# The response is used as a context manager so the connection is always closed.
with urlopen(url='https://www.gutenberg.org/cache/epub/25606/pg25606.txt') as response:
    text1 = response.read().decode('utf-8')
len(text1)  # length of the text in characters
lc_cn.convert(text1[657:999])
# In[2] Text labelled "The Injustice to Dou E" in the original notes
# NOTE(review): this URL is identical to the one in In[1], so the same document
# is downloaded twice; the intended ebook id for this work needs to be confirmed.
with urlopen(url='https://www.gutenberg.org/cache/epub/25606/pg25606.txt') as response:
    text2 = response.read().decode('utf-8')
len(text2)  # length of the text in characters
lc_cn.convert(text2[657:1300])

语料库的构建与应用

# =============================================================================
# The book does not say where to download the Jin Yong novels, so the corpus
# has to be obtained separately. The site used for these notes was:
# http://www.janpn.info/
# It may not stay available; point the paths below at your own copy.
# =============================================================================
# In[0]
import nltk
from nltk.book import *
from nltk.corpus import PlaintextCorpusReader
# In[1] List the files of the local corpus
corpus_root ='G:/桌面/pyhon/txtdata'  # directory where the downloaded novels are stored
# NOTE(review): the original comment asks to swap the path separators; the
# forward slashes are kept exactly as written -- confirm on your machine.
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()  # file list
# In[2] Number of distinct symbols and average use per symbol
### Reference: https://blog.csdn.net/qq_42008430/article/details/105300096
corpus_file = corpus_root + '/金庸作品全集精排（三联版）.txt'
with open(corpus_file, encoding='utf-8') as f:
    # Named raw_text rather than `str`, so the builtin str is not shadowed.
    raw_text = f.read()
# set() over a string yields its distinct characters, so the "vocabulary" of
# this cell is counted per character, not per segmented word.
distinct_count = len(set(raw_text))
print(distinct_count)  # number of distinct characters
print(len(raw_text)/distinct_count)  # average number of uses per character
# In[3] Occurrences of a few names and characters
print('小龙女，杨过，雕，侠分别出现以下次数：')
print(raw_text.count('小龙女'))
print(raw_text.count('杨过'))
print(raw_text.count('雕'))
print(raw_text.count('侠'))
# In[4] Look at part of the text
raw_text[53940:54400]
# In[5] The 30 most frequent characters (punctuation included)
fdist = FreqDist(raw_text)
print(fdist.most_common(30))
# In[6] How many characters fall into each frequency range
from collections import Counter
W = Counter(raw_text)
print('词频在0~99的词数量为：', sum(1 for c in W.values() if c < 100))
print('词频在100~999的词数量为：', sum(1 for c in W.values() if 100 <= c < 1000))
print('词频在1000~4999的词数量为：', sum(1 for c in W.values() if 1000 <= c < 5000))
# ">=" so that a frequency of exactly 5000 is counted; the original "> 5000"
# left it out of every range.
print('词频在5000以上：', sum(1 for c in W.values() if c >= 5000))
# In[7] Word segmentation with jieba
# =============================================================================
# jieba is required; in the Anaconda Prompt run:
# pip install jieba
# =============================================================================
import re
import jieba
print(raw_text)
# Keep only characters in the range U+4E00..U+9FA5 before segmenting.
cleaned_data = ''.join(re.findall('[\u4e00-\u9fa5]', raw_text))
wordlist = jieba.lcut(cleaned_data)  # segmented word list
text = nltk.Text(wordlist)
print(text)
# In[8] Context of a given word
text.concordance(word='侠', width=30, lines=10)
# In[9] Words used in similar contexts
text.similar(word='李莫愁', num = 10)
# In[10] Lexical dispersion plot
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
words = ['小龙女', '杨过', '郭靖', '黄蓉']
nltk.draw.dispersion.dispersion_plot(text, words, title='词汇离散图')
plt.show()

第三章

第一题

# In[1] 3-1 re.match only matches at the beginning of the string
import re
# Adjacent literals inside parentheses are concatenated without any extra
# characters. The original backslash continuation placed the indentation of the
# second source line inside the string, so the second sentence began with spaces.
text1 = ('自然语言处理是研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。'
         '自然语言处理是一门融语言学、计算机科学、数学于一体的科学。')
print('匹配到的结果是：', re.match('自然语言处理', text1))
print('匹配到的结果是：', re.match('语言处理', text1))

# In[2] 3-2 print every sentence that starts with the phrase
p_string = text1.split('。')  # split text1 into sentences at the full stop
for line in p_string:
    if re.match('自然语言处理', line) is not None:
        print(line)

# In[3] 3-3
print(re.search('通信', text1))  # match object for the first occurrence

# In[4] 3-4
print(re.findall('计算机', text1))  # every occurrence as a list

# In[5] 3-5
print(re.sub('自然语言处理', 'NLP', text1))  # replace the phrase with "NLP"

# In[6] 3-6
print(re.findall('自.语言处理', text1))  # "." matches any single character except a newline

# In[7] 3-7
print(re.findall('[科数]学', text1))  # [] matches any one of the listed characters

# In[8] 3-8
print(re.findall('方法|计算机', text1))  # "|" means "or"

# In[9] 3-9 sentences containing either alternative
p_string = text1.split('。')
for line in p_string:
    if re.search('方法|计算机', line):
        print(line)

# In[10] 3-10 sentences that start with "自"
p_string = text1.split('。')
for line in p_string:
    if re.search('^自', line):
        print(line)

# In[11] 3-11 sentences that end with "学"
p_string = text1.split('。')
for line in p_string:
    if re.search('学$', line):
        print(line)
        

# In[12] 3-12 Quantifier examples, each evaluated against the same verse
text2='今年花落颜色改，明年花开复谁在?年年岁岁花相似，岁岁年年人不同。'
quantifier_patterns = [
    '年?',        # zero or one "年"
    '年*',        # zero or more "年"
    '年+',        # one or more "年"
    '年{1}',      # exactly one "年"
    '年{2}',      # exactly two "年"
    '年{0,1}',    # at least zero and at most one "年"
    '年.+',       # "年" followed by one or more characters (greedy)
    '年+.',       # one or more "年" followed by any single character
    '年.?',       # "年" followed by at most one character
    '年.*',       # "年" followed by any number of characters (greedy)
    '年.+?',      # "年" followed by as few characters as possible, at least one
    '年.*?',      # "年" followed by as few characters as possible, possibly none
    '年?花',      # "花" preceded by at most one "年"
    '年*花',      # "花" preceded by zero or more "年"
    '年+花',      # "花" preceded by one or more "年"
    '年{1}花',    # "花" preceded by exactly one "年"
    '年{2}花',    # "花" preceded by exactly two "年"
    '年{0,1}花',  # "花" preceded by zero or one "年"
]
# Evaluate every pattern; only the result for '年*' is printed.
quantifier_matches = {
    pattern: re.findall(pattern, text2) for pattern in quantifier_patterns
}
print(quantifier_matches['年*'])

# In[13] Character-class shorthands applied to a mixed Chinese/English string
text3 = 'Hello, everyone, 我是/ 陈_X/我的_/ 、邮箱，地址是。 wxid_6cp@16.co'
# (pattern, replacement) pairs; the first three are equivalent spellings of
# "replace every digit with the word 数字".
substitutions = (
    ('\\d', '数字'),    # digit class written with an escaped backslash
    (r'\d', '数字'),    # the same pattern as a raw string
    ('[0-9]', '数字'),  # the same class spelled out
    (r'\s', ''),        # remove whitespace characters
    (r'\w', ''),        # remove word characters (letters, digits, underscore, CJK characters)
)
substituted = [re.sub(pattern, repl, text3) for pattern, repl in substitutions]
word_patterns = (
    '\\b[a-zA-Z]+',     # runs of ASCII letters that start at a word boundary
    '\\b[a-zA-Z]+\\b',  # runs of ASCII letters bounded on both sides
)
found_words = [re.findall(pattern, text3) for pattern in word_patterns]

# In[14] 3-14 Look at part of "Journey to the West"
from urllib.request import urlopen
import re
# The response is used as a context manager so the connection is always closed.
with urlopen('https://www.gutenberg.org/cache/epub/23962/pg23962.txt') as response:
    html1 = response.read().decode('utf-8')
text4 = html1[2269:2450]  # excerpt to work on
print(text4)

# In[15] 3-15 Common regular expressions for stripping symbols from Chinese text
# The building blocks are combined with "|" below. The pattern that contains
# backslash escapes is a raw string, so Python does not warn about invalid
# string escapes such as "\[" or "\s".
ascii_symbols = r'[\[\]\s+\.\!\/_,$%^*(+\"\'?:&@#;<>=-]+'  # whitespace and ASCII special characters
extra_symbols = '[+ ——! ? ~@#￥%……&*( )「」]+'  # further symbols, including ￥, …, — and 「」
cn_punctuation = '[《》，。；;、：-]+'  # Chinese punctuation marks
# Any run of ASCII letters and/or digits. The original '[a-zA-Z]+[0-9]+' only
# matched letters immediately followed by digits, leaving plain words and
# plain numbers in the text.
ascii_alnum = '[a-zA-Z0-9]+'

# Remove ASCII letters, digits and ASCII special characters
print(re.sub('|'.join([ascii_symbols, ascii_alnum]), '', text4))
# Additionally remove the extra symbols, keeping Chinese punctuation
print(re.sub('|'.join([ascii_symbols, extra_symbols, ascii_alnum]), '', text4))
# Additionally remove Chinese punctuation
print(re.sub('|'.join([ascii_symbols, extra_symbols, cn_punctuation, ascii_alnum]),
             '', text4))

# In[16] 3-16 Extract the names with a regular expression
text5 = 'J.Done:234-555-1234J.Smith:(888)555-1234A.Lee:(810)555-1234M.Jones:666.555.999'
# A capital letter followed by letters, dots or whitespace.
name_pattern = re.compile(r'[A-Z][\s\.a-zA-Z]+')
name = name_pattern.findall(text5)
for person in name:
    print(person)  # one name per line

# In[17] 3-17 Extract the phone numbers with a regular expression
# Starts with a digit or "(" and continues with digits, "-", space, ")" or ".".
tel_pattern = re.compile('[0-9(][0-9- ).]+')
tel = tel_pattern.findall(text5)
for number in tel:
    print(number)  # one phone number per line

# In[18] 3-18 Pair every name with its phone number
for pair in zip(name, tel):  # zip yields (name, phone number) tuples
    print(pair)
    
# In[19] 3-19 Extract the link targets and the link texts
text6 = '<a href="http://www.baidu.com">百度</a><a href = "http:www.weibo.com">微博</a>'
# The first lazy ".*?" skips from "<a" to the opening double quote; the
# captured lazy ".*?" is the address up to the closing quote and ">".
url = re.findall('<a.*?"(.*?)">', text6)
# The first lazy ".*?" skips the rest of the opening tag; the captured lazy
# ".*?" is the text up to the next "<".
name = re.findall('<a.*?>(.*?)<', text6)
for label, link in zip(name, url):
    print(f'{label}。{link}')

第二题

# Exercise 2: check whether the phone number contains the area code "(888)".
import re
text_num = '(888)555-1234'
goal = r'\(888\)'  # parentheses are escaped so they match literally

match = re.search(goal, text_num)
# Identity comparison is the idiomatic test for None.
if match is not None:
    print(match)
else:
    print('未匹配到(888)')

第三题

# Exercise 3: extract the e-mail addresses from a string without separators.
import re
text_mail = '111111@qq.comabcdefg@126.comabc123@163.com'
# local part, "@", shortest possible domain, literal ".com".
# The original '[a-z\d].*?com' required neither "@" nor a literal dot and
# stopped at the first "com", even inside the local part ("welcome@..." -> "welcom").
mail_pattern = re.compile(r'[a-z\d]+@[a-z\d.]+?\.com')
mail = mail_pattern.findall(text_mail)
print(mail)

实训

# In[0] Practice: download a text and strip symbols from an excerpt
from urllib.request import urlopen
import re
# The response is used as a context manager so the connection is always closed.
with urlopen('https://www.gutenberg.org/cache/epub/25606/pg25606.txt') as response:
    html1 = response.read().decode('utf-8')
text4 = html1[2000:2500]  # excerpt to work on
print(text4)

# The building blocks are combined with "|" below. The pattern that contains
# backslash escapes is a raw string, so Python does not warn about invalid
# string escapes such as "\[" or "\s".
ascii_symbols = r'[\[\]\s+\.\!\/_,$%^*(+\"\'?:&@#;<>=-]+'  # whitespace and ASCII special characters
extra_symbols = '[+ ——! ? ~@#￥%……&*( )「」]+'  # further symbols, including ￥, …, — and 「」
cn_punctuation = '[《》，。；;、：-]+'  # Chinese punctuation marks
# Any run of ASCII letters and/or digits. The original '[a-zA-Z]+[0-9]+' only
# matched letters immediately followed by digits, leaving plain words and
# plain numbers in the text.
ascii_alnum = '[a-zA-Z0-9]+'

# Remove ASCII letters, digits and ASCII special characters
print(re.sub('|'.join([ascii_symbols, ascii_alnum]), '', text4))
# Additionally remove the extra symbols, keeping Chinese punctuation
print(re.sub('|'.join([ascii_symbols, extra_symbols, ascii_alnum]), '', text4))
# Additionally remove Chinese punctuation
print(re.sub('|'.join([ascii_symbols, extra_symbols, cn_punctuation, ascii_alnum]),
             '', text4))