
本帖训练一个可以根据姓名判断性别的CNN模型;我使用自己爬取的35万中文姓名进行训练。
使用同样的数据集还可以训练起名字模型,参看:
- TensorFlow练习7: 基于RNN生成古诗词
- https://github.com/tensorflow/models/tree/master/namignizer
- TensorFlow练习13: 制作一个简单的聊天机器人
准备姓名数据集
我上网找了一下,并没有找到现成的中文姓名数据集,额,看来只能自己动手了。
我写了一个简单的Python脚本,爬取了上万中文姓名,格式整理如下:
```
姓名,性别
安镶怡,女
饶黎明,男
段焙曦,男
苗芯萌,男
覃慧藐,女
芦玥微,女
苏佳琬,女
王旎溪,女
彭琛朗,男
李昊,男
利欣怡,女
# 貌似有很多名字男女通用
```
如果你需要这个数据集,可以使用邮件或微信联系我。
训练模型
# Character-level CNN that classifies a Chinese name as male or female.
#
# Running this script loads name.csv, trains the network (saving a checkpoint
# every 50 epochs) and then classifies a few sample names.
# Written against the TensorFlow 1.x graph API (placeholders + sessions).
from collections import Counter

import numpy as np
import tensorflow as tf

name_dataset = 'name.csv'

train_x = []  # raw name strings
train_y = []  # one-hot labels: [0, 1] = male, [1, 0] = female

# The CSV has a header row followed by "name,gender" records.
# NOTE(review): assumes the file is UTF-8 encoded -- adjust if it was saved
# in another encoding.
with open(name_dataset, 'r', encoding='utf-8') as f:
    next(f, None)  # skip the header row
    for line in f:
        sample = line.strip().split(',')
        if len(sample) != 2:
            continue  # ignore malformed rows
        train_x.append(sample[0])
        # Any label other than '男' is treated as female.
        train_y.append([0, 1] if sample[1] == '男' else [1, 0])

# Report the longest name for information only; default=0 keeps an empty
# dataset from raising here.
longest_name = max((len(name) for name in train_x), default=0)
print("最长名字的字符数: ", longest_name)

# Fixed width of the network input. Longer names are truncated and shorter
# ones are padded with index 0 (see name_to_vector).
max_name_length = 8

# The data file is already shuffled, so no shuffling is done here.

# Vocabulary: every character seen in the training names, most frequent
# first. Index 0 is reserved for the padding character ' '.
vocabulary = Counter(char for name in train_x for char in name)
vocabulary_list = [' '] + sorted(vocabulary, key=vocabulary.get, reverse=True)
print(len(vocabulary_list))

# character -> integer index
vocab = {char: index for index, char in enumerate(vocabulary_list)}


def name_to_vector(name):
    """Encode *name* as a list of exactly ``max_name_length`` vocabulary indices.

    Characters missing from the vocabulary map to 0 (the padding index), names
    longer than ``max_name_length`` are truncated and shorter ones are
    right-padded with 0, so every vector fits the ``X`` placeholder.
    """
    name_vec = [vocab.get(char, 0) for char in name[:max_name_length]]
    name_vec.extend([0] * (max_name_length - len(name_vec)))
    return name_vec


train_x_vec = [name_to_vector(name) for name in train_x]

#######################################################

input_size = max_name_length
num_classes = 2

batch_size = 64
# Integer division: a trailing partial batch is not used for training.
num_batch = len(train_x_vec) // batch_size

X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])
dropout_keep_prob = tf.placeholder(tf.float32)


def neural_network(vocabulary_size, embedding_size=128, num_filters=128):
    """Build the text-CNN on top of the ``X`` placeholder and return its logits.

    Args:
        vocabulary_size: number of rows in the character embedding table.
        embedding_size: length of each character embedding.
        num_filters: number of convolution filters per filter size.

    Returns:
        A float tensor of shape [batch, num_classes] with unnormalised scores.

    Note:
        Every call creates new variables in the default graph, and the output
        weight is created with ``tf.get_variable("W")``, so the network must
        be built only once per graph. Use ``_get_network_output`` to share it.
    """
    # embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        W = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        # [batch, input_size, embedding_size]
        embedded_chars = tf.nn.embedding_lookup(W, X)
        # conv2d needs a trailing channel dimension.
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

    # convolution + maxpool layer, one branch per filter size
    filter_sizes = [3, 4, 5]
    pooled_outputs = []
    for filter_size in filter_sizes:
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(embedded_chars_expanded, W,
                                strides=[1, 1, 1, 1], padding="VALID")
            h = tf.nn.relu(tf.nn.bias_add(conv, b))
            # Pool over every position the filter visited, leaving one value
            # per filter.
            pooled = tf.nn.max_pool(h,
                                    ksize=[1, input_size - filter_size + 1, 1, 1],
                                    strides=[1, 1, 1, 1], padding='VALID')
            pooled_outputs.append(pooled)

    num_filters_total = num_filters * len(filter_sizes)
    # TensorFlow 1.x argument order: tf.concat(values, axis).
    h_pool = tf.concat(pooled_outputs, 3)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    # dropout
    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

    # output
    with tf.name_scope("output"):
        W = tf.get_variable("W", shape=[num_filters_total, num_classes],
                            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        output = tf.nn.xw_plus_b(h_drop, W, b)

    return output


# Logits tensor shared by training and inference; built lazily, exactly once.
_network_output = None


def _get_network_output():
    """Return the shared logits tensor, building the network on first use."""
    global _network_output
    if _network_output is None:
        _network_output = neural_network(len(vocabulary_list))
    return _network_output


# Training
def train_neural_network():
    """Train on ``train_x_vec`` / ``train_y`` for 201 epochs.

    Prints epoch, batch index and loss for every batch, and saves a
    "name2sex.model" checkpoint in the working directory every 50 epochs.
    """
    output = _get_network_output()

    optimizer = tf.train.AdamOptimizer(1e-3)
    # Keyword arguments are mandatory for this op in TensorFlow 1.x.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=Y))
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for e in range(201):
            for i in range(num_batch):
                start = i * batch_size
                end = start + batch_size
                batch_x = train_x_vec[start:end]
                batch_y = train_y[start:end]
                _, loss_ = sess.run(
                    [train_op, loss],
                    feed_dict={X: batch_x, Y: batch_y, dropout_keep_prob: 0.5})
                print(e, i, loss_)
            # save the model
            if e % 50 == 0:
                saver.save(sess, "name2sex.model", global_step=e)


train_neural_network()


# Using the trained model
def detect_sex(name_list):
    """Print the predicted gender for each name in *name_list*.

    Restores the latest checkpoint found in the working directory. If there is
    none, prints a notice and returns without predicting, because the
    variables would be uninitialised.

    Args:
        name_list: sequence of name strings.
    """
    if not name_list:
        return  # nothing to classify; an empty feed would not match X's shape

    x = [name_to_vector(name) for name in name_list]

    # Reuse the graph built for training instead of creating a second copy,
    # which would clash on the "W" variable and not match the checkpoint.
    output = _get_network_output()

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        # restore the previous training run
        ckpt = tf.train.get_checkpoint_state('.')
        if ckpt is None:
            print("没找到模型")
            return
        print(ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        predictions = tf.argmax(output, 1)
        res = sess.run(predictions, {X: x, dropout_keep_prob: 1.0})

        # Class index 0 is female and 1 is male, matching the one-hot labels.
        for name, label in zip(name_list, res):
            print(name, '女' if label == 0 else '男')


detect_sex(["白富美", "高帅富", "王婷婷", "田野"])
执行结果:

服务器又该续费了,如果你要使用DigitalOcean VPS,欢迎使用网页底部的链接注册,你会免费获赠10刀。另外,感谢各位码友的支持。
如要转载,请保持本文完整,并注明作者@斗大的熊猫和本文原始地址: http://blog.topspeedsnail.com/archives/10833
转载请注明:徐自远的乱七八糟小站 » 根据姓名判断性别




 苏公网安备 32021402001397号
苏公网安备 32021402001397号