自然语言处理-自学笔记-07:RNN的应用

目录:RNN的应用 / 一对一RNN / 一对多RNN / 多对一RNN / 多对多RNN / 用RNN产生文本 / 困惑度:衡量文本生成质量 / RNN-CF






一对多RNN接受一个输入并输出一个序列。这里假设各个输入彼此独立,即不需要利用先前输入的信息来对当前输入做出预测。但仍然需要循环连接:尽管只处理单个输入,输出却是一系列值,其中每个值都依赖于先前的输出值。这种结构可以用来生成图像标题。





# Section: 用RNN产生文本 (generating text with an RNN)
from __future__ import print_function

import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
import tensorflow as tf
from scipy.sparse import lil_matrix

# import nltk
# nltk.download()  # tokenizers/punkt/PY3/english.pickle

# Base URL the story files are fetched from.
url = 'https://www.cs.cmu.edu/~spok/grimmtmp/'

# Create a directory if needed
dir_name = 'stories'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)


def maybe_download(filename):
    """Download ``filename`` from ``url`` into ``dir_name`` if not present.

    Args:
        filename: File name, relative to both ``url`` and ``dir_name``.

    Returns:
        The local path reported by ``urlretrieve`` when a download took
        place; otherwise ``filename`` unchanged.
    """
    local_path = os.path.join(dir_name, filename)
    if os.path.exists(local_path):
        print('File ', filename, ' already exists.')
        return filename
    # Announce the download only when one is actually about to happen.
    print('Downloading file: ', local_path)
    downloaded_path, _ = urlretrieve(url + filename, local_path)
    return downloaded_path


num_files = 100
# Files are named 001.txt, 002.txt, ... up to num_files.
filenames = [format(i, '03d') + '.txt' for i in range(1, num_files + 1)]

for fn in filenames:
    maybe_download(fn)


def read_data(filename):
    """Return the lower-cased content of ``filename`` as a list of characters."""
    with open(filename) as f:
        data = tf.compat.as_str(f.read())
    return list(data.lower())


def _to_bigrams(chars):
    """Join consecutive, non-overlapping character pairs into 2-char strings.

    A trailing unpaired character (odd-length input) is dropped.
    """
    # The stop value is len(chars) - 1 so that the last complete pair is
    # kept; stopping at len(chars) - 2 would silently discard it.
    return [''.join(chars[i:i + 2]) for i in range(0, len(chars) - 1, 2)]


documents = []
for i, name in enumerate(filenames):
    path = os.path.join(dir_name, name)
    print('\nProcessing file %s' % path)
    chars = read_data(path)
    two_grams = _to_bigrams(chars)
    documents.append(two_grams)
    print('Data size (Characters) (Document %d) %d' % (i, len(two_grams)))
    print('Sample string (Document %d) %s' % (i, two_grams[:50]))


def build_dataset(documents):
    """Replace every bigram in every document with an integer ID.

    Bigrams occurring more than 10 times across all documents get their own
    ID, assigned in descending order of frequency starting at 1. ID 0 belongs
    to the special 'UNK' token, which stands in for every other bigram.

    Args:
        documents: List of documents, each a list of bigram strings.

    Returns:
        A tuple ``(data_list, count, dictionary, reverse_dictionary)``:
        ``data_list`` holds one list of bigram IDs per document, ``count`` is
        the list of ``(bigram, frequency)`` pairs sorted most frequent first,
        ``dictionary`` maps bigram to ID and ``reverse_dictionary`` maps ID
        back to bigram.
    """
    # Flatten all documents so frequencies are counted over the whole corpus.
    chars = []
    for d in documents:
        chars.extend(d)
    print('%d Characters found.' % len(chars))

    # Bigrams sorted by frequency (highest comes first).
    count = collections.Counter(chars).most_common()

    # Each new bigram receives the current dictionary size as its ID.
    dictionary = {'UNK': 0}
    for char, c in count:
        # Only bigrams seen more than 10 times get their own ID.
        if c > 10:
            dictionary[char] = len(dictionary)

    # Encode document by document; unknown bigrams fall back to the UNK ID.
    unk_id = dictionary['UNK']
    data_list = [[dictionary.get(char, unk_id) for char in d]
                 for d in documents]

    reverse_dictionary = {index: char for char, index in dictionary.items()}
    return data_list, count, dictionary, reverse_dictionary


# Print some statistics about data
data_list, count, dictionary, reverse_dictionary = build_dataset(documents)
print('Most common words (+UNK)', count[:5])
print('Least common words (+UNK)', count[-15:])
print('Sample data', data_list[0][:10])
print('Sample data', data_list[1][:10])
print('Vocabulary: ', len(dictionary))
vocabulary_size = len(dictionary)
del documents  # To reduce memory.
class DataGeneratorOHE(object): def __init__(self, text, batch_size, num_unroll): # Text where a bigram is denoted by its ID self._text = text # Number of bigrams in the text self._text_size = len(self._text) # Number of datapoints in a batch of data self._batch_size = batch_size # Num unroll is the number of steps we unroll the RNN in a single training step # This relates to the truncated backpropagation we discuss in Chapter 6 text self._num_unroll = num_unroll # We break the text in to several segments and the batch of data is sampled by # sampling a single item from a single segment self._segments = self._text_size // self._batch_size self._cursor = [offset * self._segments for offset in range(self._batch_size)] def next_batch(self): ''' Generates a single batch of data ''' # Train inputs (one-hot-encoded) and train outputs (one-hot-encoded) batch_data = np.zeros((self._batch_size, vocabulary_size), dtype=np.float32) batch_labels = np.zeros((self._batch_size, vocabulary_size), dtype=np.float32) # Fill in the batch datapoint by datapoint for b in range(self._batch_size): # If the cursor of a given segment exceeds the segment length # we reset the cursor back to the beginning of that segment if self._cursor[b] + 1 >= self._text_size: self._cursor[b] = b * self._segments # Add the text at the cursor as the input batch_data[b, self._text[self._cursor[b]]] = 1.0 # Add the preceding bigram as the label to be predicted batch_labels[b, self._text[self._cursor[b] + 1]] = 1.0 # Update the cursor self._cursor[b] =






