
Natural Language Processing - Self-Study Notes 07: Applications of RNNs

Contents:
Applications of RNNs
One-to-One RNNs
One-to-Many RNNs
Many-to-One RNNs
Many-to-Many RNNs
Generating Text with an RNN
Perplexity: Measuring the Quality of Generated Text
RNN-CF

Applications of RNNs

In an RNN, the current output depends on the current input as well as on the history of previously observed inputs; in other words, the output is produced from the sequence of inputs seen so far together with the current input. In practice this covers several configurations: an input sequence that produces a single output, a single input that produces an output sequence, and an input sequence that produces an output sequence of a different length.
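
To make the recurrence concrete, here is a minimal NumPy sketch (not part of the original notes; the dimensions and weight names are made up): the hidden state h carries the history of previous inputs, so the output at each step depends on both the current input and everything seen before.

import numpy as np

# Minimal sketch of one recurrent step (hypothetical sizes and weights).
input_dim, hidden_dim, output_dim = 8, 16, 8
rng = np.random.default_rng(0)
W_xh = rng.normal(scale=0.1, size=(hidden_dim, input_dim))   # input -> hidden
W_hh = rng.normal(scale=0.1, size=(hidden_dim, hidden_dim))  # hidden -> hidden (the recurrence)
W_hy = rng.normal(scale=0.1, size=(output_dim, hidden_dim))  # hidden -> output

def rnn_step(x_t, h_prev):
    h_t = np.tanh(W_xh @ x_t + W_hh @ h_prev)  # new state mixes the input with the history
    y_t = W_hy @ h_t                           # the output depends on that state
    return y_t, h_t

h = np.zeros(hidden_dim)
for x_t in rng.normal(size=(5, input_dim)):    # a toy input sequence of length 5
    y_t, h = rnn_step(x_t, h)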

One-to-One RNNs

In a one-to-one RNN, the current input depends on the previously observed inputs. This kind of RNN suits problems where every input has a corresponding output, but each output depends not only on the current input but also on the history of inputs that led to it. Examples of such tasks are stock market prediction and scene classification. In text generation, the previously predicted word becomes the input used to predict the next word.
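
As a hedged sketch of this pattern (rnn_step, embed, and vocab are hypothetical placeholders, not code from these notes), a greedy text-generation loop feeds each predicted word back in as the next input:

import numpy as np

def generate_text(rnn_step, embed, vocab, seed_word_id, h0, length=10):
    # One output per input step; each prediction becomes the next input.
    h = h0
    word_id = seed_word_id
    generated = [word_id]
    for _ in range(length):
        y, h = rnn_step(embed(word_id), h)  # one input -> one output
        word_id = int(np.argmax(y))         # greedy choice of the next word
        generated.append(word_id)
    return [vocab[i] for i in generated]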

One-to-Many RNNs

A one-to-many RNN takes a single input and outputs a sequence. Here the inputs are assumed to be independent of one another, so no information from previous inputs is needed to make the current prediction. However, recurrent connections are still required: although only a single input is processed, the output is a sequence of values in which each value depends on the previously produced outputs. Such RNNs can be used, for example, to generate image captions.
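
A hedged sketch of the one-to-many pattern for image captioning (again with hypothetical helpers): the single input, an image feature vector, is consumed once to set the initial state, and every later step is driven only by the previously emitted output.

import numpy as np

def caption_image(rnn_step, embed, image_feature, start_token_id, steps=15):
    h = image_feature            # the single input initializes the recurrent state
    token = start_token_id
    caption_ids = []
    for _ in range(steps):
        y, h = rnn_step(embed(token), h)  # outputs depend on earlier outputs through h
        token = int(np.argmax(y))
        caption_ids.append(token)
    return caption_ids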

Many-to-One RNNs

A many-to-one RNN takes an input sequence of arbitrary length and produces a single output. Sentence classification is a task that benefits from a many-to-one RNN: the sentence can be of any length, and the output is a single class label. Examples include classifying reviews as positive or negative, and classifying sentences according to the content they describe.
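
A hedged sketch of the many-to-one pattern for sentence classification (rnn_step and the classifier weights are hypothetical): the RNN folds an arbitrary-length sequence into its final state, and only that state is used to predict the class.

import numpy as np

def classify_sentence(rnn_step, word_vectors, W_class, b_class):
    h = np.zeros(W_class.shape[1])     # hidden state, sized to match the classifier
    for x_t in word_vectors:           # input sequence of arbitrary length
        _, h = rnn_step(x_t, h)        # keep folding inputs into the state
    logits = W_class @ h + b_class     # a single output, e.g. positive vs. negative
    return int(np.argmax(logits))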

Many-to-Many RNNs

A many-to-many RNN generally produces an output of arbitrary length from an input of arbitrary length; the input and output do not need to be the same length. One application is machine translation; another is chatbots.
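
A hedged encoder-decoder sketch of the many-to-many pattern used in machine translation (all helper names are hypothetical placeholders): the encoder reads the whole source sentence into a state, and the decoder then emits target words until an end token, so the output length is independent of the input length.

import numpy as np

def translate(encoder_step, decoder_step, embed_target, source_vectors,
              hidden_dim, start_id, end_id, max_len=50):
    # Encoder: many inputs, no outputs yet
    h = np.zeros(hidden_dim)
    for x_t in source_vectors:
        _, h = encoder_step(x_t, h)
    # Decoder: emit one target word at a time until the end token
    token, target_ids = start_id, []
    for _ in range(max_len):
        y, h = decoder_step(embed_target(token), h)
        token = int(np.argmax(y))
        if token == end_id:
            break
        target_ids.append(token)
    return target_ids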

Generating Text with an RNN

The code below downloads 100 story text files, splits each story into character bigrams, and maps every bigram to an integer ID (rare bigrams are replaced by the special token UNK):

from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from scipy.sparse import lil_matrix
#import nltk
#nltk.download() #tokenizers/punkt/PY3/english.pickle

url = 'https://www.cs.cmu.edu/~spok/grimmtmp/'

# Create a directory if needed
dir_name = 'stories'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

def maybe_download(filename):
    """Download a file if not present"""
    print('Downloading file: ', dir_name + os.sep + filename)
    if not os.path.exists(dir_name + os.sep + filename):
        filename, _ = urlretrieve(url + filename, dir_name + os.sep + filename)
    else:
        print('File ', filename, ' already exists.')
    return filename

num_files = 100
filenames = [format(i, '03d') + '.txt' for i in range(1, 101)]

for fn in filenames:
    maybe_download(fn)

def read_data(filename):
    with open(filename) as f:
        data = tf.compat.as_str(f.read())
        data = data.lower()
        data = list(data)
    return data

documents = []
for i in range(num_files):
    print('\nProcessing file %s' % os.path.join(dir_name, filenames[i]))
    chars = read_data(os.path.join(dir_name, filenames[i]))
    two_grams = [''.join(chars[ch_i:ch_i + 2]) for ch_i in range(0, len(chars) - 2, 2)]
    documents.append(two_grams)
    print('Data size (Characters) (Document %d) %d' % (i, len(two_grams)))
    print('Sample string (Document %d) %s' % (i, two_grams[:50]))

def build_dataset(documents):
    chars = []
    # This is going to be a list of lists,
    # where the outer list denotes each document
    # and the inner lists denote words in a given document
    data_list = []

    for d in documents:
        chars.extend(d)
    print('%d Characters found.' % len(chars))

    count = []
    # Get the bigrams sorted by their frequency (highest comes first)
    count.extend(collections.Counter(chars).most_common())

    # Create an ID for each bigram by giving it the current length of the dictionary
    # and adding that item to the dictionary.
    # Start with 'UNK', which is assigned to rare words
    dictionary = dict({'UNK': 0})
    for char, c in count:
        # Only add a bigram to the dictionary if its frequency is more than 10
        if c > 10:
            dictionary[char] = len(dictionary)

    unk_count = 0
    # Traverse through all the text we have
    # to replace each string word with the ID of the word
    for d in documents:
        data = list()
        for char in d:
            # If the word is in the dictionary use the word ID,
            # else use the ID of the special token "UNK"
            if char in dictionary:
                index = dictionary[char]
            else:
                index = dictionary['UNK']
                unk_count += 1
            data.append(index)
        data_list.append(data)

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data_list, count, dictionary, reverse_dictionary

global data_list, count, dictionary, reverse_dictionary, vocabulary_size

# Print some statistics about the data
data_list, count, dictionary, reverse_dictionary = build_dataset(documents)
print('Most common words (+UNK)', count[:5])
print('Least common words (+UNK)', count[-15:])
print('Sample data', data_list[0][:10])
print('Sample data', data_list[1][:10])
print('Vocabulary: ', len(dictionary))
vocabulary_size = len(dictionary)
del documents  # To reduce memory.

The following class generates one-hot-encoded batches of bigram inputs and their next-bigram labels:

class DataGeneratorOHE(object):

    def __init__(self, text, batch_size, num_unroll):
        # Text where a bigram is denoted by its ID
        self._text = text
        # Number of bigrams in the text
        self._text_size = len(self._text)
        # Number of datapoints in a batch of data
        self._batch_size = batch_size
        # Num unroll is the number of steps we unroll the RNN in a single training step;
        # this relates to the truncated backpropagation discussed in Chapter 6
        self._num_unroll = num_unroll
        # We break the text into several segments, and the batch of data is formed by
        # sampling a single item from each segment
        self._segments = self._text_size // self._batch_size
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]

    def next_batch(self):
        '''
        Generates a single batch of data
        '''
        # Train inputs (one-hot-encoded) and train outputs (one-hot-encoded)
        batch_data = np.zeros((self._batch_size, vocabulary_size), dtype=np.float32)
        batch_labels = np.zeros((self._batch_size, vocabulary_size), dtype=np.float32)

        # Fill in the batch datapoint by datapoint
        for b in range(self._batch_size):
            # If the cursor of a given segment exceeds the text length,
            # we reset the cursor back to the beginning of that segment
            if self._cursor[b] + 1 >= self._text_size:
                self._cursor[b] = b * self._segments

            # Add the bigram at the cursor as the input
            batch_data[b, self._text[self._cursor[b]]] = 1.0
            # Add the bigram that follows the cursor as the label to be predicted
            batch_labels[b, self._text[self._cursor[b] + 1]] = 1.0
            # Update the cursor (advance one step, wrapping around the end of the text)
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size

        return batch_data, batch_labels
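
As a quick sanity check (this usage snippet is an assumption, not part of the original notes; the batch size and unroll length are arbitrary), one batch can be drawn from the generator and decoded back into bigrams:

batch_size, num_unroll = 64, 50  # hypothetical hyperparameters
data_gen = DataGeneratorOHE(data_list[0], batch_size, num_unroll)
u_data, u_labels = data_gen.next_batch()
print('Inputs shape :', u_data.shape)   # (batch_size, vocabulary_size)
print('Labels shape :', u_labels.shape)
# Decode the first datapoint back into bigrams
print('Input bigram :', reverse_dictionary[int(np.argmax(u_data[0]))])
print('Label bigram :', reverse_dictionary[int(np.argmax(u_labels[0]))])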

