|
- import os
- import numpy as np
- import tensorflow as tf
- import time
- import argparse
- from tqdm import tqdm
- from sklearn import metrics
- import horovod.tensorflow as hvd
-
-
def pop_env():
    """Remove HTTP(S) proxy variables from the environment (no-op if absent)."""
    for proxy_var in ('https_proxy', 'http_proxy'):
        # pop with a default avoids the membership check entirely
        os.environ.pop(proxy_var, None)
-
-
# Strip proxy settings at import time so Horovod/NCCL inter-worker
# connections are not routed through an HTTP proxy.
pop_env()

# Launch examples:
# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model
# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model
# if using multi nodes setting in conda, need to modify /etc/bash.bashrc
# we can also use mpirun (default gloo):
# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model
-
-
def train_criteo(model, args):
    """Train a CTR model on Criteo data with Horovod data parallelism.

    Each worker trains on its own contiguous shard of the data; gradients
    are averaged across workers by ``hvd.DistributedOptimizer``.

    Args:
        model: callable taking the (dense, sparse, label) placeholders and
            returning ``(loss, prediction, optimizer)``.
        args: parsed CLI namespace. ``args.all`` selects the full Criteo
            dataset (long run, metrics logged to file); ``args.model`` is the
            model name used in the log file path.
    """
    hvd.init()

    def get_current_shard(data):
        # Even split across workers along axis 0; the last rank absorbs the
        # remainder rows so no sample is dropped.
        part_size = data.shape[0] // hvd.size()
        start = part_size * hvd.rank()
        end = start + part_size if hvd.rank() != hvd.size() - \
            1 else data.shape[0]
        return data[start:end]

    if args.all:
        from models.load_data import process_all_criteo_data
        dense, sparse, all_labels = process_all_criteo_data()
        dense_feature = get_current_shard(dense[0])
        sparse_feature = get_current_shard(sparse[0])
        labels = get_current_shard(all_labels[0])
        # NOTE(review): validation shards are computed but never used below —
        # presumably kept for a future evaluation loop; confirm before removing.
        val_dense = get_current_shard(dense[1])
        val_sparse = get_current_shard(sparse[1])
        val_labels = get_current_shard(all_labels[1])
    else:
        from models.load_data import process_sampled_criteo_data
        dense_feature, sparse_feature, labels = process_sampled_criteo_data()
        dense_feature = get_current_shard(dense_feature)
        sparse_feature = get_current_shard(sparse_feature)
        labels = get_current_shard(labels)

    batch_size = 128
    # Criteo layout: 13 dense (numeric) features, 26 sparse (categorical) ids.
    dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
    sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
    # fix: was a duplicated assignment `y_ = y_ = ...`
    y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])

    loss, y, opt = model(dense_input, sparse_input, y_)
    # fix: use the compat.v1 namespace consistently with the rest of the file
    # (tf.train.get_or_create_global_step is not available under TF2).
    global_step = tf.compat.v1.train.get_or_create_global_step()
    # here in DistributedOptimizer by default all tensors are reduced on GPU;
    # can use device_sparse=xxx, device_dense=xxx to modify.
    # if using device_sparse='/cpu:0', the performance degrades.
    train_op = hvd.DistributedOptimizer(
        opt).minimize(loss, global_step=global_step)

    # Pin each worker to its local GPU and grow memory on demand.
    gpu_options = tf.compat.v1.GPUOptions(
        allow_growth=True, visible_device_list=str(hvd.local_rank()))
    # Broadcast initial variables from rank 0 on CPU: the default GPU
    # broadcast can cause OOM during initialization.
    hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
    sess = tf.compat.v1.train.MonitoredTrainingSession(
        hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

    # Reusable feed buffers; contents are overwritten in-place every step so
    # no per-step allocation happens.
    my_feed_dict = {
        dense_input: np.empty(shape=(batch_size, 13)),
        sparse_input: np.empty(shape=(batch_size, 26)),
        y_: np.empty(shape=(batch_size, 1)),
    }

    if args.all:
        raw_log_file = './logs/tf_hvd_%s_%d.log' % (
            args.model, hvd.local_rank())
        print('Processing all data, log to', raw_log_file)
        log_file = open(raw_log_file, 'w')
        iterations = dense_feature.shape[0] // batch_size
        total_epoch = 400
        start_index = 0
        for ep in range(total_epoch):
            print("epoch %d" % ep)
            st_time = time.time()
            train_loss, train_acc, train_auc = [], [], []
            # Each "epoch" covers ~1/10 of the shard; every 10th epoch also
            # consumes the leftover iterations so a full pass lines up.
            for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
                my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
                my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
                my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
                start_index += batch_size
                # Wrap around when the next batch would run past the shard.
                if start_index + batch_size > dense_feature.shape[0]:
                    start_index = 0
                loss_val = sess.run([loss, y, y_, train_op],
                                    feed_dict=my_feed_dict)
                pred_val = loss_val[1]
                true_val = loss_val[2]
                # astype(float32) added for consistency with the sampled-data
                # branch below; np.mean over bool vs float32 is identical.
                acc_val = np.equal(
                    true_val,
                    pred_val > 0.5).astype(np.float32)
                train_loss.append(loss_val[0])
                train_acc.append(acc_val)
                train_auc.append(metrics.roc_auc_score(true_val, pred_val))
            tra_accuracy = np.mean(train_acc)
            tra_loss = np.mean(train_loss)
            tra_auc = np.mean(train_auc)
            en_time = time.time()
            train_time = en_time - st_time
            printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
                % (tra_loss, tra_accuracy, tra_auc, train_time)
            print(printstr)
            log_file.write(printstr + '\n')
            log_file.flush()
        # fix: the log file handle was previously leaked
        log_file.close()
    else:
        iterations = dense_feature.shape[0] // batch_size

        epoch = 50
        for ep in range(epoch):
            print('epoch', ep)
            if ep == 5:
                # Exclude the first 5 warm-up epochs from the overall timing.
                start = time.time()
            ep_st = time.time()
            train_loss = []
            train_acc = []
            for idx in range(iterations):
                start_index = idx * batch_size
                my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
                my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
                my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]

                loss_val = sess.run([loss, y, y_, train_op],
                                    feed_dict=my_feed_dict)
                pred_val = loss_val[1]
                true_val = loss_val[2]
                if pred_val.shape[1] == 1:  # binary output (criteo case)
                    # astype(float32) added for consistency with the
                    # multi-class branch; mean is unchanged.
                    acc_val = np.equal(
                        true_val,
                        pred_val > 0.5).astype(np.float32)
                else:  # multi-class output
                    acc_val = np.equal(
                        np.argmax(pred_val, 1),
                        np.argmax(true_val, 1)).astype(np.float32)
                train_loss.append(loss_val[0])
                train_acc.append(acc_val)
            tra_accuracy = np.mean(train_acc)
            tra_loss = np.mean(train_loss)
            ep_en = time.time()
            print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
                  % (tra_loss, tra_accuracy, ep_en - ep_st))
        print('all time:', (time.time() - start))
-
-
def main():
    """Parse CLI arguments, resolve the requested model, and launch training.

    Requires ``--model NAME`` where NAME is a callable defined in the local
    ``tf_models`` module; ``--all`` switches to the full Criteo dataset.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True,
                        help="model to be tested")
    parser.add_argument("--all", action="store_true",
                        help="whether to use all data")
    args = parser.parse_args()
    raw_model = args.model
    import tf_models
    # fix: getattr instead of eval() — avoids executing arbitrary CLI input
    # and raises a clear AttributeError for unknown model names.
    model = getattr(tf_models, raw_model)
    print('Model:', raw_model)
    train_criteo(model, args)
-
-
# Script entry point: run training only when executed directly (not when
# this file is imported as a module by a launcher).
if __name__ == '__main__':
    main()
|