Brute-Force NLP Poetry Generation Based on modern-poetry

Principle and Background

The background, of course, is the modern poetry collection (modern-poetry) I built earlier; click the link above 👆 if you want to learn more. You can also check out Poetry share, a random-poem-quote site likewise built on modern poetry.

The principle is very simple statistics. To generate a poem with NLP, we first need to work out, for each position in a line, which characters appear most often:

For example, in “无事小神仙”: 无 is the first character, 事 the second, 小 the third…

With enough data, we can count which characters appear most frequently at each position; once the counts are collected, we can generate poems simply by drawing random numbers!
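To make this concrete, here is a minimal sketch of the positional tally (the three sample lines are made up for illustration; the real code below runs over the whole corpus):

from collections import Counter

# Toy corpus: three sample lines with punctuation already stripped
lines = ["无事小神仙", "无人与我立黄昏", "小楼一夜听春雨"]

# One Counter per position: frequency[i][ch] = how often ch appears at position i
frequency = {}
for line in lines:
    for i, ch in enumerate(line):
        frequency.setdefault(i, Counter())[ch] += 1

print(frequency[0].most_common(2))  # [('无', 2), ('小', 1)]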

Single-Character Probability

Single-character probability simply applies the statistics above as our “machine learning” to generate poems. The code:

import json
import random

# JSON files storing the poems
name = ["0-49.json", "50-99.json","100-149.json", "150-199.json","200-249.json", "250-299.json","300-349.json", "350-399.json","400-449.json", "450-499.json","500-518.json"]
lengthDict = {}

# Strip the punctuation marks that appear in the corpus
def stripPunct(s):
    for p in ",。、().!;":
        s = s.replace(p, "")
    return s

# Sort a dict by value in descending order; returns a list of (key, count) pairs
def sortDict(dic):
    return sorted(dic.items(), key=lambda x: x[1], reverse=True)

# Count how many sentences there are of each length
def numberDict():
    for i in name:
        jsonFile = json.load(open(i, encoding="utf-8"))
        for k in jsonFile:
            for m in k["paragraphs"]:
                m = stripPunct(m)
                size = str(len(m))
                lengthDict[size] = lengthDict.get(size, 0) + 1

    return sortDict(lengthDict)

# Count how often each character appears at each position (first 15 positions)
def countWord():
    frequency = {i: {} for i in range(15)}

    # load each file once instead of once per position
    for z in name:
        jsonFile = json.load(open(z, encoding="utf-8"))
        for k in jsonFile:
            for m in k["paragraphs"]:  # the stored lines of the poem
                m = stripPunct(m)
                # tally the character at each position of the line
                for i in range(min(len(m), 15)):
                    zi = m[i]
                    frequency[i][zi] = frequency[i].get(zi, 0) + 1

    return frequency

#lengths = numberDict()  # only used to see which sentence lengths are most common
frequencyDict = countWord()
sortedFrequencyDict = {}

for i in range(15):
    sortedFrequencyDict[i] = sortDict(frequencyDict[i])[0:100]

dictSortedFrequency = {}

# Convert the sorted lists back into dicts keyed by position
for i in range(len(sortedFrequencyDict)):
    dictSortedFrequency[str(i)] = {}
    for k in range(len(sortedFrequencyDict[i])):
        dictSortedFrequency[str(i)][str(sortedFrequencyDict[i][k][0])] = sortedFrequencyDict[i][k][1]

#print(dictSortedFrequency)

# Run once to produce pos_top100.json, which the 2.0 program below reads
#json_data = json.dumps(dictSortedFrequency, indent=4, ensure_ascii=False)
#with open("pos_top100.json", 'w', encoding='utf-8') as f:
#    f.write(json_data)

# Read user input
line = int(input("Please input how many lines you want: "))
lineCollection = []
result = []
for temp in range(line):
    lineCollection.append(int(input("Please input how many characters you want: ")))

for k in lineCollection:
    sentence = ""
    for m in range(k):  # note: only positions 0-14 are modelled
        rand = random.randint(0, 99)
        sentence += sortedFrequencyDict[m][rand][0]
    result.append(sentence)

print(result)

For more detail, see the Zhihu post 暴力写诗(二)(单字概率).

But poems generated this way lack soul (though occasionally they produce a memorable line):

请许你长成而有
两那从和人水
和为水黑着么被后
用见中下

Which brings us to the method below.

Two-Character Probability

Two-character probability, in short, segments the text two characters at a time: pairs that occur often are treated as bound two-character words.

For example, split three characters at a time. To split ABC: if AB occurs more often than BC, group AB together and leave C on its own; otherwise keep BC and leave A alone. Finally, store all the pairs and all the leftover single characters, recording their frequencies.
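A minimal sketch of that decision rule, using a toy corpus and the hypothetical chunk "ABC":

# Toy corpus in which "AB" occurs twice and "BC" once
total_text = "ABZABXBC"
chunk = "ABC"  # three characters to split

# Keep whichever pair is more frequent; the leftover character stands alone
if total_text.count(chunk[:2]) >= total_text.count(chunk[1:]):
    pair, single = chunk[:2], chunk[2]
else:
    pair, single = chunk[1:], chunk[0]

print(pair, single)  # AB C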

But because modern poetry is quite free-form and every line has a different length, we can handle this with the following scheme (a sketch follows the list):

  • If the line length is even, fill it entirely with two-character words
  • If the line length is odd, put a single character at a randomly chosen spot between the two-character words
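As a sketch of this filling scheme (plan_line is a hypothetical helper, not part of the final program), a line length is first decomposed into slot sizes:

import random

# Hypothetical helper: decompose a line of n characters into two-character
# slots, plus one single-character slot at a random gap when n is odd.
def plan_line(n):
    slots = [2] * (n // 2)
    if n % 2 == 1:
        slots.insert(random.randint(0, len(slots)), 1)
    return slots

print(plan_line(6))  # [2, 2, 2]
print(plan_line(7))  # e.g. [2, 1, 2, 2]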

Segmentation code:

import json

name = ["0-49.json", "50-99.json","100-149.json", "150-199.json","200-249.json", "250-299.json","300-349.json", "350-399.json","400-449.json", "450-499.json","500-518.json"]

# Collect every line of poetry, stripping punctuation
poem_sentence = []
for i in name:
    jsonFile = json.load(open(i, encoding="utf-8"))
    for k in jsonFile:
        for m in k["paragraphs"]:
            for p in ",。、().!;:":
                m = m.replace(p, "")
            poem_sentence.append(m)

sentence_count = len(poem_sentence)

# Concatenate into one huge string for counting
total_text = ''.join(poem_sentence)
print('total sentence:', sentence_count)
# word_1 holds single characters, word_2 holds two-character words
word_2 = []
word_1 = []

def split_word(word):
    word = word[:3]
    word1 = word[:2]   # candidate pair "AB"
    word2 = word[1:]   # candidate pair "BC"
    # the heavy counting happens here
    num = total_text.count(word1)
    num_1 = total_text.count(word2)
    if num >= num_1:
        return (word[2], word1)
    else:
        return (word[0], word2)

for i, sentence in enumerate(poem_sentence):
    if i % 100 == 0:
        print('now parsing...{:.2f}%'.format(i / sentence_count * 100))
    if len(sentence) - 2 < 3:
        continue
    # the leading pair of each line is kept as a two-character word
    word_2.append(sentence[:2])

    times = (len(sentence) - 2) // 3
    # the core segmentation: split each three-character chunk after the leading pair
    for k in range(1, times + 1):
        word_3 = sentence[3*k-1:3*k+2]
        _split = split_word(word_3)
        word_1.append(_split[0])
        word_2.append(_split[1])

# Remove duplicates
word_1 = set(word_1)
word_2 = set(word_2)
            
json_data = json.dumps(list(word_1), indent=4, ensure_ascii=False)
with open('word_1.json', 'w', encoding='utf-8-sig') as f:
    f.write(json_data)

json_data = json.dumps(list(word_2), indent=4, ensure_ascii=False)
with open('word_2.json', 'w', encoding='utf-8-sig') as f:
    f.write(json_data)

Frequency-counting code:

import json

name = ["0-49.json", "50-99.json","100-149.json", "150-199.json","200-249.json", "250-299.json","300-349.json", "350-399.json","400-449.json", "450-499.json","500-518.json"]
poem_sentence = []
for i in name:
    jsonFile = json.load(open(i, encoding="utf-8"))
    for k in jsonFile:
        for m in k["paragraphs"]:
            for p in ",。、().!;:":
                m = m.replace(p, "")
            poem_sentence.append(m)

sentence_count = len(poem_sentence)
total_text = ''.join(poem_sentence)
print('total sentence:', sentence_count)

json_data = json.dumps(poem_sentence, indent=4, ensure_ascii=False)
with open('poem_sentence.json', 'w', encoding='utf-8-sig') as f:
    f.write(json_data)

# Start counting pair frequencies
with open('word_2.json', 'r', encoding='utf-8-sig') as f:
    double_word_data = json.loads(f.read())
double_word_freq = {}
double_word_num = len(double_word_data)

for idx, word in enumerate(double_word_data):
    if idx % 100 == 0:
        print('{:.2f}%'.format(idx / double_word_num * 100))
    count = total_text.count(word)
    if count != 1:  # drop pairs that occur only once
        double_word_freq[word] = count

json_data = json.dumps(double_word_freq, indent=4, ensure_ascii=False)
with open('double_word_freq_fixed.json', 'w', encoding='utf-8-sig') as f:
    f.write(json_data)

With the frequency table built, how do we actually generate a two-character word?

We need to build a dictionary that supports two lookups:

  • Given the leading character, return the two-character words it starts
  • Given the trailing character, return the two-character words it ends

The code:

import json

with open('double_word_freq_fixed.json', 'r', encoding='utf-8-sig') as f:
    double_word_data = json.loads(f.read())

# Index every pair by its first character and by its second character
double_word_dict_head = {}
double_word_dict_end = {}
for word in double_word_data.keys():
    double_word_dict_head.setdefault(word[0], []).append(word)
    double_word_dict_end.setdefault(word[1], []).append(word)

json_data = json.dumps(double_word_dict_head, indent=4, ensure_ascii=False)
with open('double_word_dict_head.json', 'w', encoding='utf-8-sig') as f:
    f.write(json_data)

json_data = json.dumps(double_word_dict_end, indent=4, ensure_ascii=False)
with open('double_word_dict_end.json', 'w', encoding='utf-8-sig') as f:
    f.write(json_data)
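A quick usage sketch of the two lookup tables (the example characters 天 and 水 are arbitrary and may not appear in your corpus):

import json

with open('double_word_dict_head.json', 'r', encoding='utf-8-sig') as f:
    by_head = json.load(f)
with open('double_word_dict_end.json', 'r', encoding='utf-8-sig') as f:
    by_end = json.load(f)

print(by_head.get('天', []))  # all recorded pairs that begin with '天'
print(by_end.get('水', []))   # all recorded pairs that end with '水'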

Finally, we wire everything together into our NLP 2.0 program:

import json
import random

# the 100 most frequent characters at each position
with open('pos_top100.json', 'r', encoding='utf-8-sig') as f:
    pos_100_data = json.loads(f.read())
# all two-character words, indexed by their first character
with open('double_word_dict_head.json', 'r', encoding='utf-8-sig') as f:
    double_word_data = json.loads(f.read())

# Pick a random two-character word that starts with the given character
# (assumes the character occurs as the head of at least one pair)
def get_double_word(word):
    _list = double_word_data[word][:int(len(double_word_data[word]) / 3) + 1]
    return random.choice(_list)

def make_sentence(no):
    '''
    no: the number of characters in the sentence
    even length: fill entirely with two-character words ($$$$$$)
    odd length:  put one single character at a random spot ($$$@$$$$)
    Note: only positions 0-14 are modelled in pos_top100.json.
    '''
    sentence = ''
    if no % 2 == 0:
        for i in range(no // 2):
            characterLis = list(pos_100_data[str(i * 2)].items())
            word_1 = characterLis[random.randint(0, len(characterLis) - 1)][0]
            sentence += get_double_word(word_1)
    else:
        # random even position where the lone character goes
        single_pos = random.randrange(0, no, 2)
        pos = 0
        while pos < no:
            characterLis = list(pos_100_data[str(pos)].items())
            zi = characterLis[random.randint(0, len(characterLis) - 1)][0]
            if pos == single_pos:
                sentence += zi
                pos += 1
            else:
                sentence += get_double_word(zi)
                pos += 2

    return sentence

line = int(input("Please input how many lines you want: "))
lineCollection = []
result = []
for temp in range(line):
    lineCollection.append(int(input("Please input how many characters you want: ")))

for k in lineCollection:
    result.append(make_sentence(k))

print(result)

The improved version still has its problems, but the poems it writes are noticeably better (at least psychologically):

带看声又手语
人怀在慢又回只孤
而把冬出河
自焚生中得鬼小说

References

This article draws on the NLP暴力写诗 (brute-force poetry writing) series by 刘南飞 on Zhihu; many thanks to him!