TensorFlow Machine Learning Cookbook, Second Edition: 9-11 (Part 2): https://developer.aliyun.com/article/1426843
- If the model and data directories do not already exist, create them:
```python
# Make Model Directory
if not os.path.exists(full_model_dir):
    os.makedirs(full_model_dir)

# Make data directory
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
```
- Next, we load the English-German sentence data, downloading it from www.manythings.org and caching it to disk if it is not already there:
```python
print('Loading English-German Data')
# Check for data, if it doesn't exist, download it and save it
if not os.path.isfile(os.path.join(data_dir, data_file)):
    print('Data not found, downloading Eng-Ger sentences from www.manythings.org')
    sentence_url = 'http://www.manythings.org/anki/deu-eng.zip'
    r = requests.get(sentence_url)
    z = ZipFile(io.BytesIO(r.content))
    file = z.read('deu.txt')
    # Format Data
    eng_ger_data = file.decode('utf-8')
    eng_ger_data = eng_ger_data.encode('ascii', errors='ignore')
    eng_ger_data = eng_ger_data.decode().split('\n')
    # Write to file
    with open(os.path.join(data_dir, data_file), 'w') as out_conn:
        for sentence in eng_ger_data:
            out_conn.write(sentence + '\n')
else:
    eng_ger_data = []
    with open(os.path.join(data_dir, data_file), 'r') as in_conn:
        for row in in_conn:
            eng_ger_data.append(row[:-1])
print('Done!')
```
- Now we remove punctuation and split the translation data into separate English and German word lists:
```python
# Remove punctuation
eng_ger_data = [''.join(char for char in sent if char not in punct) for sent in eng_ger_data]

# Split each sentence by tabs
eng_ger_data = [x.split('\t') for x in eng_ger_data if len(x) >= 1]
[english_sentence, german_sentence] = [list(x) for x in zip(*eng_ger_data)]
english_sentence = [x.lower().split() for x in english_sentence]
german_sentence = [x.lower().split() for x in german_sentence]
```
- To use the faster data-pipeline functions in TensorFlow, we need to write the formatted data to disk in the format the translation model expects, which is as follows:
```
train_prefix.source_suffix = train.en
train_prefix.target_suffix = train.de
```
The suffix determines the language (`en` = English, `de` = Deutsch/German) and the prefix determines the type of dataset (train or test):
```python
# We need to write them to separate text files for the text-line-dataset operations.
train_prefix = 'train'
src_suffix = 'en'  # English
tgt_suffix = 'de'  # Deutsch (German)
source_txt_file = train_prefix + '.' + src_suffix
hparams.add_hparam('src_file', source_txt_file)
target_txt_file = train_prefix + '.' + tgt_suffix
hparams.add_hparam('tgt_file', target_txt_file)

with open(source_txt_file, 'w') as f:
    for sent in english_sentence:
        f.write(' '.join(sent) + '\n')

with open(target_txt_file, 'w') as f:
    for sent in german_sentence:
        f.write(' '.join(sent) + '\n')
```
- Next, we need to set aside some (~100) sentence translations for testing. Here we arbitrarily pick about 100 sentences and then write them to the appropriate files as well:
```python
# Partition some sentences off for testing files
test_prefix = 'test_sent'
hparams.add_hparam('dev_prefix', test_prefix)
hparams.add_hparam('train_prefix', train_prefix)
hparams.add_hparam('test_prefix', test_prefix)
hparams.add_hparam('src', src_suffix)
hparams.add_hparam('tgt', tgt_suffix)

num_sample = 100
total_samples = len(english_sentence)
# Get around 'num_sample's every so often in the src/tgt sentences
ix_sample = [x for x in range(total_samples) if x % (total_samples // num_sample) == 0]
test_src = [' '.join(english_sentence[x]) for x in ix_sample]
test_tgt = [' '.join(german_sentence[x]) for x in ix_sample]

# Write test sentences to file
with open(test_prefix + '.' + src_suffix, 'w') as f:
    for eng_test in test_src:
        f.write(eng_test + '\n')

with open(test_prefix + '.' + tgt_suffix, 'w') as f:
    for ger_test in test_tgt:
        f.write(ger_test + '\n')
```
- Next, we process the vocabularies for the English and German sentences, and then save the vocabulary lists to the appropriate files:
```python
print('Processing the vocabularies.')
# Process the English Vocabulary
all_english_words = [word for sentence in english_sentence for word in sentence]
all_english_counts = Counter(all_english_words)
eng_word_keys = [x[0] for x in all_english_counts.most_common(vocab_size-3)]  # -3 because UNK, S, /S is also in there
eng_vocab2ix = dict(zip(eng_word_keys, range(1, vocab_size)))
eng_ix2vocab = {val: key for key, val in eng_vocab2ix.items()}
english_processed = []
for sent in english_sentence:
    temp_sentence = []
    for word in sent:
        try:
            temp_sentence.append(eng_vocab2ix[word])
        except KeyError:
            temp_sentence.append(0)
    english_processed.append(temp_sentence)

# Process the German Vocabulary
all_german_words = [word for sentence in german_sentence for word in sentence]
all_german_counts = Counter(all_german_words)
ger_word_keys = [x[0] for x in all_german_counts.most_common(vocab_size-3)]  # -3 because UNK, S, /S is also in there
ger_vocab2ix = dict(zip(ger_word_keys, range(1, vocab_size)))
ger_ix2vocab = {val: key for key, val in ger_vocab2ix.items()}
german_processed = []
for sent in german_sentence:
    temp_sentence = []
    for word in sent:
        try:
            temp_sentence.append(ger_vocab2ix[word])
        except KeyError:
            temp_sentence.append(0)
    german_processed.append(temp_sentence)

# Save vocab files for data processing
source_vocab_file = 'vocab' + '.' + src_suffix
hparams.add_hparam('src_vocab_file', source_vocab_file)
eng_word_keys = ['<unk>', '<s>', '</s>'] + eng_word_keys

target_vocab_file = 'vocab' + '.' + tgt_suffix
hparams.add_hparam('tgt_vocab_file', target_vocab_file)
ger_word_keys = ['<unk>', '<s>', '</s>'] + ger_word_keys

# Write out all unique english words
with open(source_vocab_file, 'w') as f:
    for eng_word in eng_word_keys:
        f.write(eng_word + '\n')

# Write out all unique german words
with open(target_vocab_file, 'w') as f:
    for ger_word in ger_word_keys:
        f.write(ger_word + '\n')

# Add vocab size to hyper parameters
hparams.add_hparam('src_vocab_size', vocab_size)
hparams.add_hparam('tgt_vocab_size', vocab_size)

# Add out-directory
out_dir = 'temp/seq2seq/nmt_out'
hparams.add_hparam('out_dir', out_dir)
if not tf.gfile.Exists(out_dir):
    tf.gfile.MakeDirs(out_dir)
```
- Next, we will create the training, inference, and evaluation graphs separately. First, we create the training graph. We do this with a class whose parameters are a `namedtuple`. This code is adapted from the NMT TensorFlow repository; for more information, see the file named `model_helper.py` in that repository:
```python
class TrainGraph(collections.namedtuple("TrainGraph", ("graph", "model", "iterator", "skip_count_placeholder"))):
    pass

def create_train_graph(scope=None):
    graph = tf.Graph()
    with graph.as_default():
        src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(hparams.src_vocab_file,
                                                                           hparams.tgt_vocab_file,
                                                                           share_vocab=False)
        src_dataset = tf.data.TextLineDataset(hparams.src_file)
        tgt_dataset = tf.data.TextLineDataset(hparams.tgt_file)
        skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)
        iterator = iterator_utils.get_iterator(src_dataset, tgt_dataset, src_vocab_table, tgt_vocab_table,
                                               batch_size=hparams.batch_size,
                                               sos=hparams.sos,
                                               eos=hparams.eos,
                                               random_seed=None,
                                               num_buckets=hparams.num_buckets,
                                               src_max_len=hparams.src_max_len,
                                               tgt_max_len=hparams.tgt_max_len,
                                               skip_count=skip_count_placeholder)
        final_model = model.Model(hparams,
                                  iterator=iterator,
                                  mode=tf.contrib.learn.ModeKeys.TRAIN,
                                  source_vocab_table=src_vocab_table,
                                  target_vocab_table=tgt_vocab_table,
                                  scope=scope)
    return TrainGraph(graph=graph, model=final_model, iterator=iterator,
                      skip_count_placeholder=skip_count_placeholder)

train_graph = create_train_graph()
```
- We now create the evaluation graph:
```python
# Create the evaluation graph
class EvalGraph(collections.namedtuple("EvalGraph", ("graph", "model", "src_file_placeholder",
                                                     "tgt_file_placeholder", "iterator"))):
    pass

def create_eval_graph(scope=None):
    graph = tf.Graph()
    with graph.as_default():
        src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(
            hparams.src_vocab_file, hparams.tgt_vocab_file, hparams.share_vocab)
        src_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        tgt_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        src_dataset = tf.data.TextLineDataset(src_file_placeholder)
        tgt_dataset = tf.data.TextLineDataset(tgt_file_placeholder)
        iterator = iterator_utils.get_iterator(src_dataset, tgt_dataset, src_vocab_table, tgt_vocab_table,
                                               hparams.batch_size,
                                               sos=hparams.sos,
                                               eos=hparams.eos,
                                               random_seed=hparams.random_seed,
                                               num_buckets=hparams.num_buckets,
                                               src_max_len=hparams.src_max_len_infer,
                                               tgt_max_len=hparams.tgt_max_len_infer)
        final_model = model.Model(hparams,
                                  iterator=iterator,
                                  mode=tf.contrib.learn.ModeKeys.EVAL,
                                  source_vocab_table=src_vocab_table,
                                  target_vocab_table=tgt_vocab_table,
                                  scope=scope)
    return EvalGraph(graph=graph, model=final_model, src_file_placeholder=src_file_placeholder,
                     tgt_file_placeholder=tgt_file_placeholder, iterator=iterator)

eval_graph = create_eval_graph()
```
- Now we do the same for the inference graph:
```python
# Inference graph
class InferGraph(collections.namedtuple("InferGraph", ("graph", "model", "src_placeholder",
                                                       "batch_size_placeholder", "iterator"))):
    pass

def create_infer_graph(scope=None):
    graph = tf.Graph()
    with graph.as_default():
        src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(hparams.src_vocab_file,
                                                                           hparams.tgt_vocab_file,
                                                                           hparams.share_vocab)
        reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file(hparams.tgt_vocab_file,
                                                                             default_value=vocab_utils.UNK)
        src_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64)
        src_dataset = tf.data.Dataset.from_tensor_slices(src_placeholder)
        iterator = iterator_utils.get_infer_iterator(src_dataset, src_vocab_table,
                                                     batch_size=batch_size_placeholder,
                                                     eos=hparams.eos,
                                                     src_max_len=hparams.src_max_len_infer)
        final_model = model.Model(hparams,
                                  iterator=iterator,
                                  mode=tf.contrib.learn.ModeKeys.INFER,
                                  source_vocab_table=src_vocab_table,
                                  target_vocab_table=tgt_vocab_table,
                                  reverse_target_vocab_table=reverse_tgt_vocab_table,
                                  scope=scope)
    return InferGraph(graph=graph, model=final_model, src_placeholder=src_placeholder,
                      batch_size_placeholder=batch_size_placeholder, iterator=iterator)

infer_graph = create_infer_graph()
```
- To provide more illustrative output during training, we set aside a short list of arbitrary source/target translations to print during the training iterations:
```python
# Create sample data for evaluation
sample_ix = [25, 125, 240, 450]
sample_src_data = [' '.join(english_sentence[x]) for x in sample_ix]
sample_tgt_data = [' '.join(german_sentence[x]) for x in sample_ix]
print([x for x in zip(sample_src_data, sample_tgt_data)])
```
- Next, we create a session for each graph and load the training graph:
```python
config_proto = utils.get_config_proto()

train_sess = tf.Session(config=config_proto, graph=train_graph.graph)
eval_sess = tf.Session(config=config_proto, graph=eval_graph.graph)
infer_sess = tf.Session(config=config_proto, graph=infer_graph.graph)

# Load the training graph
with train_graph.graph.as_default():
    loaded_train_model, global_step = model_helper.create_or_load_model(train_graph.model,
                                                                        hparams.out_dir,
                                                                        train_sess,
                                                                        "train")

summary_writer = tf.summary.FileWriter(os.path.join(hparams.out_dir, 'Training'), train_graph.graph)
```
- Now we add the evaluation operations to the graph:
```python
for metric in hparams.metrics:
    hparams.add_hparam("best_" + metric, 0)
    best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
    hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
    tf.gfile.MakeDirs(best_metric_dir)

eval_output = train.run_full_eval(hparams.out_dir, infer_graph, infer_sess, eval_graph, eval_sess,
                                  hparams, summary_writer, sample_src_data, sample_tgt_data)
eval_results, _, acc_blue_scores = eval_output
```
- Now we create the initialization operations and initialize the graph; we also initialize some parameters that will be updated on each iteration (time, global step, and epoch step):
```python
# Training Initialization
last_stats_step = global_step
last_eval_step = global_step
last_external_eval_step = global_step
steps_per_eval = 10 * hparams.steps_per_stats
steps_per_external_eval = 5 * steps_per_eval

avg_step_time = 0.0
step_time, checkpoint_loss, checkpoint_predict_count = 0.0, 0.0, 0.0
checkpoint_total_count = 0.0
speed, train_ppl = 0.0, 0.0

utils.print_out("# Start step %d, lr %g, %s" %
                (global_step, loaded_train_model.learning_rate.eval(session=train_sess), time.ctime()))
skip_count = hparams.batch_size * hparams.epoch_step
utils.print_out("# Init train iterator, skipping %d elements" % skip_count)

train_sess.run(train_graph.iterator.initializer,
               feed_dict={train_graph.skip_count_placeholder: skip_count})
```
Note that, by default, training saves the model every 1,000 iterations. You can change this in the hyperparameters if you wish. Currently, training this model and keeping the latest five checkpoints takes up about 2 GB of disk space.
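The 1,000-iteration cadence comes from `steps_per_eval = 10 * hparams.steps_per_stats` in the initialization code above and is used by the training loop below. As a hedged sketch of how you might change it (this assumes `hparams` is the `tf.contrib.training.HParams` object used throughout the recipe and that these values are set before the initialization code runs; it is not the recipe's exact code):

```python
# Sketch only: the training loop checkpoints every 10 * steps_per_stats steps,
# so raising steps_per_stats makes checkpoints (and stats printing) less frequent.
hparams.set_hparam('steps_per_stats', 250)   # roughly a 2,500-step checkpoint cadence

# Keeping fewer checkpoints also trims disk usage; this assumes you build a
# saver of your own inside the training graph instead of the model's default saver.
with train_graph.graph.as_default():
    lean_saver = tf.train.Saver(max_to_keep=2)
```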
- The following code starts the training and evaluation of the model. The important training part is at the very beginning of the loop (the first third); the rest of the code is devoted to evaluation, inference on the sample sentences, and saving the model, as follows:
```python
# Run training
while global_step < hparams.num_train_steps:
    start_time = time.time()
    try:
        step_result = loaded_train_model.train(train_sess)
        (_, step_loss, step_predict_count, step_summary, global_step, step_word_count,
         batch_size, __, ___) = step_result
        hparams.epoch_step += 1
    except tf.errors.OutOfRangeError:
        # Next Epoch
        hparams.epoch_step = 0
        utils.print_out("# Finished an epoch, step %d. Perform external evaluation" % global_step)
        train.run_sample_decode(infer_graph, infer_sess, hparams.out_dir, hparams,
                                summary_writer, sample_src_data, sample_tgt_data)
        dev_scores, test_scores, _ = train.run_external_eval(infer_graph, infer_sess,
                                                             hparams.out_dir, hparams, summary_writer)
        train_sess.run(train_graph.iterator.initializer,
                       feed_dict={train_graph.skip_count_placeholder: 0})
        continue

    summary_writer.add_summary(step_summary, global_step)

    # Statistics
    step_time += (time.time() - start_time)
    checkpoint_loss += (step_loss * batch_size)
    checkpoint_predict_count += step_predict_count
    checkpoint_total_count += float(step_word_count)

    # print statistics
    if global_step - last_stats_step >= hparams.steps_per_stats:
        last_stats_step = global_step
        avg_step_time = step_time / hparams.steps_per_stats
        train_ppl = utils.safe_exp(checkpoint_loss / checkpoint_predict_count)
        speed = checkpoint_total_count / (1000 * step_time)

        utils.print_out("  global step %d lr %g "
                        "step-time %.2fs wps %.2fK ppl %.2f %s" %
                        (global_step,
                         loaded_train_model.learning_rate.eval(session=train_sess),
                         avg_step_time, speed, train_ppl, train._get_best_results(hparams)))

        if math.isnan(train_ppl):
            break

        # Reset timer and loss.
        step_time, checkpoint_loss, checkpoint_predict_count = 0.0, 0.0, 0.0
        checkpoint_total_count = 0.0

    if global_step - last_eval_step >= steps_per_eval:
        last_eval_step = global_step
        utils.print_out("# Save eval, global step %d" % global_step)
        utils.add_summary(summary_writer, global_step, "train_ppl", train_ppl)

        # Save checkpoint
        loaded_train_model.saver.save(train_sess, os.path.join(hparams.out_dir, "translate.ckpt"),
                                      global_step=global_step)

        # Evaluate on dev/test
        train.run_sample_decode(infer_graph, infer_sess, out_dir, hparams,
                                summary_writer, sample_src_data, sample_tgt_data)
        dev_ppl, test_ppl = train.run_internal_eval(eval_graph, eval_sess, out_dir,
                                                    hparams, summary_writer)

    if global_step - last_external_eval_step >= steps_per_external_eval:
        last_external_eval_step = global_step

        # Save checkpoint
        loaded_train_model.saver.save(train_sess, os.path.join(hparams.out_dir, "translate.ckpt"),
                                      global_step=global_step)

        train.run_sample_decode(infer_graph, infer_sess, out_dir, hparams,
                                summary_writer, sample_src_data, sample_tgt_data)
        dev_scores, test_scores, _ = train.run_external_eval(infer_graph, infer_sess, out_dir,
                                                             hparams, summary_writer)
```
How it works...
For this recipe, we used TensorFlow's built-in sequence-to-sequence models to translate from English to German.
Since we did not obtain perfect translations for our test sentences, there is room for improvement. If we trained for longer, and perhaps combined some buckets (so that there is more training data in each bucket), we might be able to improve our translations.
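As a rough, hedged sketch of those tweaks (both hyperparameter names already appear in the graph-building and training-loop code above; the specific values are only illustrative, and they must be set before the graphs are built):

```python
# Illustrative values only: fewer buckets puts more sentence pairs in each
# bucket, and more training steps lengthens the run.
hparams.set_hparam('num_buckets', 2)
hparams.set_hparam('num_train_steps', 20000)
```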
There's more...
Other similar bilingual sentence datasets are hosted on the ManyThings website. Feel free to substitute any language dataset that appeals to you.
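For example, a minimal sketch of swapping in a different pair, assuming the archive follows the same naming pattern as the `deu-eng.zip` file used earlier (the archive name and the text file inside it are assumptions you should verify on the site):

```python
import io
import requests
from zipfile import ZipFile

# Sketch: point the download at another ManyThings/Anki language pair.
sentence_url = 'http://www.manythings.org/anki/fra-eng.zip'  # e.g., French-English
r = requests.get(sentence_url)
z = ZipFile(io.BytesIO(r.content))
file = z.read('fra.txt')  # the inner file name mirrors the language code
```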
Training a Siamese RNN similarity measure
An important property of RNN models, compared to many other models, is that they can work with sequences of various lengths. Taking advantage of this, and of the fact that they can generalize to sequences they have not seen before, we can create a way to measure how similar input sequences are to one another. In this recipe, we will train a Siamese similarity RNN to measure the similarity between addresses for record matching.
Getting ready
In this recipe, we will build a bidirectional RNN model that feeds into a fully connected layer that outputs a fixed-length numerical vector (of length 100). We create bidirectional RNN layers for both input addresses and feed their outputs through the fully connected layer. We then compare the two output vectors with the cosine distance, which lies between -1 and 1. We label similar input pairs with a target of 1 and dissimilar pairs with a target of -1. The prediction from the cosine distance is just the sign of the output (negative means dissimilar, positive means similar). We can use this network for record matching by taking, for a query address, the reference address with the highest cosine-distance score.
Refer to the following network architecture diagram:
![Siamese RNN similarity model architecture](https://gitcode.net/apachecn/apachecn-dl-zh/-/raw/master/docs/tf-ml-cookbook-2e-zh/img/3ab9a414-bf14-4bef-a6b5-77deef75eea6.png)
Figure 8: Siamese RNN similarity model architecture
A further advantage of this model is that it accepts inputs it has not seen before and can compare them with an output between -1 and 1. We will demonstrate this in the code by picking test addresses the model has not seen before and checking whether it can match them to similar addresses.
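Before the full recipe, here is a minimal, standalone sketch of the comparison step at the heart of the model: unit-normalize the two fixed-length output vectors, take their dot product to get the cosine similarity in [-1, 1], and use its sign as the prediction. The vectors below are made-up placeholders, not actual model outputs:

```python
import numpy as np

def cosine_similarity(v1, v2):
    """Cosine similarity of two vectors via a unit-normalized dot product."""
    v1 = v1 / np.linalg.norm(v1)
    v2 = v2 / np.linalg.norm(v2)
    return np.dot(v1, v2)

# Made-up stand-ins for the two 100-dimensional Siamese outputs
out1 = np.random.randn(100)
out2 = out1 + 0.1 * np.random.randn(100)   # a slightly perturbed copy

score = cosine_similarity(out1, out2)      # close to 1 for similar vectors
prediction = np.sign(score)                # +1 = similar, -1 = dissimilar
print(score, prediction)
```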
How to do it...
- We start by loading the necessary libraries and starting a graph session:
```python
import os
import random
import string
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

sess = tf.Session()
```
- We now set the model parameters as follows:
```python
batch_size = 200
n_batches = 300
max_address_len = 20
margin = 0.25
num_features = 50
dropout_keep_prob = 0.8
```
- Next, we create the Siamese RNN similarity model function, as follows:
```python
def snn(address1, address2, dropout_keep_prob, vocab_size, num_features, input_length):
    # Define the Siamese double RNN with a fully connected layer at the end
    def Siamese_nn(input_vector, num_hidden):
        cell_unit = tf.nn.rnn_cell.BasicLSTMCell

        # Forward direction cell
        lstm_forward_cell = cell_unit(num_hidden, forget_bias=1.0)
        lstm_forward_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_forward_cell,
                                                          output_keep_prob=dropout_keep_prob)

        # Backward direction cell
        lstm_backward_cell = cell_unit(num_hidden, forget_bias=1.0)
        lstm_backward_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_backward_cell,
                                                           output_keep_prob=dropout_keep_prob)

        # Split the address into a character sequence
        input_embed_split = tf.split(input_vector, input_length, axis=1)
        input_embed_split = [tf.squeeze(x, axis=[1]) for x in input_embed_split]

        # Create bidirectional layer
        outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_forward_cell, lstm_backward_cell,
                                                       input_embed_split, dtype=tf.float32)
        # Average the output over the sequence
        temporal_mean = tf.add_n(outputs) / input_length

        # Fully connected layer
        output_size = 10
        A = tf.get_variable(name="A", shape=[2 * num_hidden, output_size], dtype=tf.float32,
                            initializer=tf.random_normal_initializer(stddev=0.1))
        b = tf.get_variable(name="b", shape=[output_size], dtype=tf.float32,
                            initializer=tf.random_normal_initializer(stddev=0.1))
        final_output = tf.matmul(temporal_mean, A) + b
        final_output = tf.nn.dropout(final_output, dropout_keep_prob)
        return final_output

    with tf.variable_scope("Siamese") as scope:
        output1 = Siamese_nn(address1, num_features)
        # Declare that we will use the same variables on the second string
        scope.reuse_variables()
        output2 = Siamese_nn(address2, num_features)

    # Unit normalize the outputs
    output1 = tf.nn.l2_normalize(output1, 1)
    output2 = tf.nn.l2_normalize(output2, 1)

    # Return cosine distance
    # in this case, the dot product of the norms is the same.
    dot_prod = tf.reduce_sum(tf.multiply(output1, output2), 1)

    return dot_prod
```
Note that a variable scope is used to share the parameters between the two halves of the Siamese network for the two address inputs. Also note that the cosine distance is computed as the dot product of the normalized output vectors.
- Now we declare our prediction function, which is simply the sign of the cosine distance, as follows:
```python
def get_predictions(scores):
    predictions = tf.sign(scores, name="predictions")
    return predictions
```
- Now we will declare our `loss` function as described earlier. Remember that we want to leave a margin for the error (similar to an SVM model). We will also have loss terms for true positives and true negatives. Use the following code for the loss:
```python
def loss(scores, y_target, margin):
    # Calculate the positive losses
    pos_loss_term = 0.25 * tf.square(tf.subtract(1., scores))
    pos_mult = tf.cast(y_target, tf.float32)

    # Make sure positive losses are on similar strings
    positive_loss = tf.multiply(pos_mult, pos_loss_term)

    # Calculate negative losses, then make sure on dissimilar strings
    neg_mult = tf.subtract(1., tf.cast(y_target, tf.float32))
    negative_loss = neg_mult * tf.square(scores)

    # Combine similar and dissimilar losses
    loss = tf.add(positive_loss, negative_loss)

    # Create the margin term. This is when the targets are 0, and the scores are less than m, return 0.
    # Check if target is zero (dissimilar strings)
    target_zero = tf.equal(tf.cast(y_target, tf.float32), 0.)
    # Check if cosine outputs is smaller than margin
    less_than_margin = tf.less(scores, margin)
    # Check if both are true
    both_logical = tf.logical_and(target_zero, less_than_margin)
    both_logical = tf.cast(both_logical, tf.float32)
    # If both are true, then multiply by (1-1)=0.
    multiplicative_factor = tf.cast(1. - both_logical, tf.float32)
    total_loss = tf.multiply(loss, multiplicative_factor)

    # Average loss over batch
    avg_loss = tf.reduce_mean(total_loss)
    return avg_loss
```
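As a reading aid, the quantity the function above computes for a single score $s$, cast target $y$, and margin $m$ is the following; this is a direct transcription of the code rather than an extra claim about how the labels are encoded:

$$
\ell(s, y) = \left[ \tfrac{1}{4}\,y\,(1 - s)^2 + (1 - y)\,s^2 \right] \cdot \mathbf{1}\!\left[\lnot\left(y = 0 \,\wedge\, s < m\right)\right],
\qquad
\text{avg\_loss} = \frac{1}{N}\sum_{i=1}^{N} \ell(s_i, y_i)
$$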
- We declare the `accuracy` function as follows:
```python
def accuracy(scores, y_target):
    predictions = get_predictions(scores)
    correct_predictions = tf.equal(predictions, y_target)
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
    return accuracy
```
- We will create similar addresses by introducing a typo into an address. We will denote these addresses (the reference address and the typo address) as similar:
```python
def create_typo(s):
    rand_ind = random.choice(range(len(s)))
    s_list = list(s)
    s_list[rand_ind] = random.choice(string.ascii_lowercase + '0123456789')
    s = ''.join(s_list)
    return s
```
- The data we generate will be random combinations of street numbers, `street_names`, and street suffixes. The names and suffixes come from the following lists:
```python
street_names = ['abbey', 'baker', 'canal', 'donner', 'elm', 'fifth', 'grandvia', 'hollywood',
                'interstate', 'jay', 'kings']
street_types = ['rd', 'st', 'ln', 'pass', 'ave', 'hwy', 'cir', 'dr', 'jct']
```
- We generate the test queries and references as follows:
```python
test_queries = ['111 abbey ln', '271 doner cicle', '314 king avenue', 'tensorflow is fun']
test_references = ['123 abbey ln', '217 donner cir', '314 kings ave', '404 hollywood st',
                   'tensorflow is so fun']
```
Note that the last query and reference are not addresses the model will have seen before, but we hope the model will find them to be the most similar pair.
- We will now define how to generate a batch of data. Our batches will consist of 50% similar addresses (a reference address and its typo address) and 50% dissimilar addresses. We generate the dissimilar addresses by taking half of the address list and shifting the targets by one position (using the `numpy.roll()` function):
```python
def get_batch(n):
    # Generate a list of reference addresses with similar addresses that have
    # a typo.
    numbers = [random.randint(1, 9999) for i in range(n)]
    streets = [random.choice(street_names) for i in range(n)]
    street_suffs = [random.choice(street_types) for i in range(n)]
    full_streets = [str(w) + ' ' + x + ' ' + y for w, x, y in zip(numbers, streets, street_suffs)]
    typo_streets = [create_typo(x) for x in full_streets]
    reference = [list(x) for x in zip(full_streets, typo_streets)]

    # Shuffle last half of them for training on dissimilar addresses
    half_ix = int(n / 2)
    bottom_half = reference[half_ix:]
    true_address = [x[0] for x in bottom_half]
    typo_address = [x[1] for x in bottom_half]
    typo_address = list(np.roll(typo_address, 1))
    bottom_half = [[x, y] for x, y in zip(true_address, typo_address)]
    reference[half_ix:] = bottom_half

    # Get target similarities (1's for similar, -1's for non-similar)
    target = [1] * (n - half_ix) + [-1] * half_ix
    reference = [[x, y] for x, y in zip(reference, target)]
    return reference
```
- Next, we define the address vocabulary and specify how to one-hot encode the addresses into indices:
```python
vocab_chars = string.ascii_lowercase + '0123456789 '
vocab2ix_dict = {char: (ix + 1) for ix, char in enumerate(vocab_chars)}
vocab_length = len(vocab_chars) + 1

# Define vocab one-hot encoding
def address2onehot(address,
                   vocab2ix_dict=vocab2ix_dict,
                   max_address_len=max_address_len):
    # translate address string into indices
    address_ix = [vocab2ix_dict[x] for x in list(address)]

    # Pad or crop to max_address_len
    address_ix = (address_ix + [0] * max_address_len)[0:max_address_len]
    return address_ix
```
- With the vocabulary processed, we will begin declaring our model placeholders and the embedding lookup. For the embedding lookup, we will use a single matrix
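This step continues in the next part of the article (linked below). As a hedged sketch of what such an embedding lookup can look like, the placeholder names and the identity-matrix embedding below are illustrative assumptions rather than the recipe's exact code; `max_address_len` and `vocab_length` come from the steps above:

```python
# Sketch only: placeholders for two address index sequences and their target,
# plus an identity-matrix "embedding" so that looking up an index yields its
# one-hot vector. Names here are illustrative assumptions.
address1_ph = tf.placeholder(tf.int32, [None, max_address_len], name="address1_ph")
address2_ph = tf.placeholder(tf.int32, [None, max_address_len], name="address2_ph")
y_target_ph = tf.placeholder(tf.int32, [None], name="y_target_ph")

identity_mat = tf.diag(tf.ones(shape=[vocab_length]))  # one row per character index
address1_embed = tf.nn.embedding_lookup(identity_mat, address1_ph)
address2_embed = tf.nn.embedding_lookup(identity_mat, address2_ph)
```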
TensorFlow Machine Learning Cookbook, Second Edition: 9-11 (Part 4): https://developer.aliyun.com/article/1426845