This post trains a CNN model that predicts gender from a Chinese name, using a dataset of 350,000 Chinese names I scraped myself.
The same dataset can also be used to train a name-generation model; see:
- TensorFlow Exercise 7: Generating classical Chinese poetry with an RNN
- https://github.com/tensorflow/models/tree/master/namignizer
- TensorFlow Exercise 13: Building a simple chatbot
Preparing the name dataset
I searched online and could not find a ready-made dataset of Chinese names, so I had to build one myself.
I wrote a simple Python script to scrape Chinese names and cleaned them up into the following format:
姓名,性别
安镶怡,女
饶黎明,男
段焙曦,男
苗芯萌,男
覃慧藐,女
芦玥微,女
苏佳琬,女
王旎溪,女
彭琛朗,男
李昊,男
利欣怡,女
# Many names appear to be unisex
If you need this dataset, you can reach me by email or WeChat.
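For reference, here is a minimal sketch of how such a file can be produced with Python's csv module; the records list is a stand-in for whatever your scraper actually collects (the scraping code itself is not part of this post):

import csv

# Stand-in for real scraper output: a list of (name, gender) pairs.
records = [('安镶怡', '女'), ('饶黎明', '男'), ('段焙曦', '男')]

with open('name.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['姓名', '性别'])  # header row; the training script skips it
    writer.writerows(records)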
Training the model
import tensorflow as tf
import numpy as np

name_dataset = 'name.csv'

train_x = []
train_y = []
with open(name_dataset, 'r') as f:
    first_line = True
    for line in f:
        if first_line is True:
            first_line = False
            continue
        sample = line.strip().split(',')
        if len(sample) == 2:
            train_x.append(sample[0])
            if sample[1] == '男':
                train_y.append([0, 1])  # male
            else:
                train_y.append([1, 0])  # female

max_name_length = max([len(name) for name in train_x])
print("Longest name (in characters): ", max_name_length)
max_name_length = 8  # cap/pad every name to 8 characters

# The data is already shuffled
#shuffle_indices = np.random.permutation(np.arange(len(train_y)))
#train_x = train_x[shuffle_indices]
#train_y = train_y[shuffle_indices]

# Vocabulary (see the chatbot exercise)
counter = 0
vocabulary = {}
for name in train_x:
    counter += 1
    tokens = [word for word in name]
    for word in tokens:
        if word in vocabulary:
            vocabulary[word] += 1
        else:
            vocabulary[word] = 1

vocabulary_list = [' '] + sorted(vocabulary, key=vocabulary.get, reverse=True)
print(len(vocabulary_list))

# Convert each name string to a vector of character indices
vocab = dict([(x, y) for (y, x) in enumerate(vocabulary_list)])
train_x_vec = []
for name in train_x:
    name_vec = []
    for word in name:
        name_vec.append(vocab.get(word))
    while len(name_vec) < max_name_length:
        name_vec.append(0)
    train_x_vec.append(name_vec)
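# For illustration only (the real indices depend on character frequencies in
# the corpus): if vocab happened to map '李' -> 17 and '昊' -> 403, the name
# "李昊" would become [17, 403, 0, 0, 0, 0, 0, 0] after padding with 0
# (the index of the ' ' placeholder) up to max_name_length = 8.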
#######################################################
input_size = max_name_length
num_classes = 2
batch_size = 64
num_batch = len(train_x_vec) // batch_size
X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])
dropout_keep_prob = tf.placeholder(tf.float32)
def neural_network(vocabulary_size, embedding_size=128, num_filters=128):
    # embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        W = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embedded_chars = tf.nn.embedding_lookup(W, X)
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
    # convolution + maxpool layer
    filter_sizes = [3, 4, 5]
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding="VALID")
            h = tf.nn.relu(tf.nn.bias_add(conv, b))
            # a height-k filter over an 8-char name leaves 8-k+1 positions;
            # max-pooling over all of them keeps one value per filter
            pooled = tf.nn.max_pool(h, ksize=[1, input_size - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
            pooled_outputs.append(pooled)

    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(pooled_outputs, 3)  # TF >= 1.0 argument order; use tf.concat(3, ...) on older versions
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
    # dropout
    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
    # output
    with tf.name_scope("output"):
        W = tf.get_variable("W", shape=[num_filters_total, num_classes], initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        output = tf.nn.xw_plus_b(h_drop, W, b)
    return output
# Training
def train_neural_network():
    output = neural_network(len(vocabulary_list))

    optimizer = tf.train.AdamOptimizer(1e-3)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=Y))  # keyword args required in TF >= 1.0
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for e in range(201):
            for i in range(num_batch):
                batch_x = train_x_vec[i*batch_size : (i+1)*batch_size]
                batch_y = train_y[i*batch_size : (i+1)*batch_size]
                _, loss_ = sess.run([train_op, loss], feed_dict={X: batch_x, Y: batch_y, dropout_keep_prob: 0.5})
                print(e, i, loss_)
            # save the model every 50 epochs
            if e % 50 == 0:
                saver.save(sess, "name2sex.model", global_step=e)

train_neural_network()
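# Note: detect_sex() below builds the graph a second time via neural_network().
# Because the output layer uses tf.get_variable("W"), calling it twice in the
# same process raises a "Variable W already exists" error. Comment out the
# train_neural_network() call above (or restart the interpreter) before
# running inference.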
# Use the trained model
def detect_sex(name_list):
    x = []
    for name in name_list:
        name_vec = []
        for word in name:
            name_vec.append(vocab.get(word, 0))  # default to 0 for characters not seen in training
        while len(name_vec) < max_name_length:
            name_vec.append(0)
        x.append(name_vec)

    output = neural_network(len(vocabulary_list))

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        # restore the latest training checkpoint
        ckpt = tf.train.get_checkpoint_state('.')
        if ckpt is not None:
            print(ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("No model found")

        predictions = tf.argmax(output, 1)
        res = sess.run(predictions, {X: x, dropout_keep_prob: 1.0})

        i = 0
        for name in name_list:
            print(name, '女' if res[i] == 0 else '男')
            i += 1

detect_sex(["白富美", "高帅富", "王婷婷", "田野"])
Execution results: the script prints a predicted gender (男 or 女) for each of the four test names.
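The training loop only prints the loss and never measures accuracy. As a rough sanity check, one could add something like the following to train_neural_network(); this is a sketch I am adding, not part of the original script, and it evaluates on training samples (a held-out split would give a more honest number):

# define once, next to the loss op:
correct = tf.equal(tf.argmax(output, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# evaluate at the end of each epoch, with dropout disabled:
acc = sess.run(accuracy, feed_dict={X: train_x_vec[:1000], Y: train_y[:1000], dropout_keep_prob: 1.0})
print('epoch %d, accuracy on first 1000 samples: %f' % (e, acc))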
If you reproduce this post, please keep it intact and credit the author @斗大的熊猫 along with the original URL: http://blog.topspeedsnail.com/archives/10833