|
|
|
{ |
|
|
|
"cells": [ |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 1, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"2.2.0\n", |
|
|
|
"sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)\n", |
|
|
|
"matplotlib 3.3.4\n", |
|
|
|
"numpy 1.19.5\n", |
|
|
|
"pandas 1.1.5\n", |
|
|
|
"sklearn 0.24.2\n", |
|
|
|
"tensorflow 2.2.0\n", |
|
|
|
"tensorflow.keras 2.3.0-tf\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"import matplotlib as mpl\n", |
|
|
|
"import matplotlib.pyplot as plt\n", |
|
|
|
"%matplotlib inline\n", |
|
|
|
"import numpy as np\n", |
|
|
|
"import sklearn\n", |
|
|
|
"import pandas as pd\n", |
|
|
|
"import os\n", |
|
|
|
"import sys\n", |
|
|
|
"import time\n", |
|
|
|
"import tensorflow as tf\n", |
|
|
|
"\n", |
|
|
|
"from tensorflow import keras\n", |
|
|
|
"\n", |
|
|
|
"print(tf.__version__)\n", |
|
|
|
"print(sys.version_info)\n", |
|
|
|
"for module in mpl, np, pd, sklearn, tf, keras:\n", |
|
|
|
" print(module.__name__, module.__version__)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 2, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"from sklearn.datasets import fetch_california_housing\n", |
|
|
|
"\n", |
|
|
|
"housing = fetch_california_housing()" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 3, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"(11610, 8) (11610,)\n", |
|
|
|
"(3870, 8) (3870,)\n", |
|
|
|
"(5160, 8) (5160,)\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"from sklearn.model_selection import train_test_split\n", |
|
|
|
"\n", |
|
|
|
"x_train_all, x_test, y_train_all, y_test = train_test_split(\n", |
|
|
|
" housing.data, housing.target, random_state = 7)\n", |
|
|
|
"x_train, x_valid, y_train, y_valid = train_test_split(\n", |
|
|
|
" x_train_all, y_train_all, random_state = 11)\n", |
|
|
|
"print(x_train.shape, y_train.shape)\n", |
|
|
|
"print(x_valid.shape, y_valid.shape)\n", |
|
|
|
"print(x_test.shape, y_test.shape)\n" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 4, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"from sklearn.preprocessing import StandardScaler\n", |
|
|
|
"\n", |
|
|
|
"scaler = StandardScaler()\n", |
|
|
|
"x_train_scaled = scaler.fit_transform(x_train)\n", |
|
|
|
"x_valid_scaled = scaler.transform(x_valid)\n", |
|
|
|
"x_test_scaled = scaler.transform(x_test)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 6, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"!rm -rf generate_csv" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 5, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"tf01-dataset_basic_api.ipynb tf03-tfrecord_basic_api.ipynb\r\n", |
|
|
|
"tf02_data_generate_csv.ipynb tf04_data_generate_tfrecord.ipynb\r\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"!ls" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 7, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"numpy.ndarray" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 7, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"type(x_train_scaled)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 8, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"['.ipynb_checkpoints',\n", |
|
|
|
" 'tf02_data_generate_csv.ipynb',\n", |
|
|
|
" 'tf04_data_generate_tfrecord.ipynb',\n", |
|
|
|
" 'tf03-tfrecord_basic_api.ipynb',\n", |
|
|
|
" 'tf01-dataset_basic_api.ipynb']" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 8, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"os.listdir()" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 14, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"0 [0 1 2 3 4]\n", |
|
|
|
"1 [5 6 7 8 9]\n", |
|
|
|
"2 [10 11 12 13 14]\n", |
|
|
|
"3 [15 16 17 18 19]\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"#为了把数据分好\n", |
|
|
|
"for file_idx, row_indices in enumerate(np.array_split(np.arange(20), 4)):\n", |
|
|
|
" print(file_idx,row_indices)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 10, |
|
|
|
"metadata": { |
|
|
|
"scrolled": true |
|
|
|
}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue\n", |
|
|
|
"--------------------------------------------------\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"#下面要把特征工程后的数据存为csv文件\n", |
|
|
|
"output_dir = \"generate_csv\"\n", |
|
|
|
"if not os.path.exists(output_dir):\n", |
|
|
|
" os.mkdir(output_dir)\n", |
|
|
|
"\n", |
|
|
|
"#save_to_csv是工作可以直接复用的\n", |
|
|
|
"def save_to_csv(output_dir, data, name_prefix,\n", |
|
|
|
" header=None, n_parts=10):\n", |
|
|
|
" #生成文件名 格式generate_csv/{}_{:02d}.csv\n", |
|
|
|
" path_format = os.path.join(output_dir, \"{}_{:02d}.csv\") \n", |
|
|
|
" filenames = []\n", |
|
|
|
" #把数据分为n_parts部分,写到文件中去\n", |
|
|
|
" for file_idx, row_indices in enumerate(\n", |
|
|
|
" np.array_split(np.arange(len(data)), n_parts)):\n", |
|
|
|
" #print(file_idx,row_indices)\n", |
|
|
|
" #生成子文件名\n", |
|
|
|
" part_csv = path_format.format(name_prefix, file_idx)\n", |
|
|
|
" filenames.append(part_csv) #文件名添加到列表\n", |
|
|
|
" with open(part_csv, \"w\", encoding=\"utf-8\") as f:\n", |
|
|
|
" #先写头部\n", |
|
|
|
" if header is not None:\n", |
|
|
|
" f.write(header + \"\\n\")\n", |
|
|
|
" for row_index in row_indices:\n", |
|
|
|
" #把字符串化后的每个字符串用逗号拼接起来\n", |
|
|
|
" f.write(\",\".join(\n", |
|
|
|
" [repr(col) for col in data[row_index]]))\n", |
|
|
|
" f.write('\\n')\n", |
|
|
|
" return filenames\n", |
|
|
|
"#np.c_把x和y合并起来,按轴1合并\n", |
|
|
|
"train_data = np.c_[x_train_scaled, y_train]\n", |
|
|
|
"valid_data = np.c_[x_valid_scaled, y_valid]\n", |
|
|
|
"test_data = np.c_[x_test_scaled, y_test]\n", |
|
|
|
"#头部,特征,也有目标\n", |
|
|
|
"header_cols = housing.feature_names + [\"MidianHouseValue\"]\n", |
|
|
|
"#把列表变为字符串\n", |
|
|
|
"header_str = \",\".join(header_cols)\n", |
|
|
|
"print(header_str)\n", |
|
|
|
"print('-'*50)\n", |
|
|
|
"train_filenames = save_to_csv(output_dir, train_data, \"train\",\n", |
|
|
|
" header_str, n_parts=20)\n", |
|
|
|
"valid_filenames = save_to_csv(output_dir, valid_data, \"valid\",\n", |
|
|
|
" header_str, n_parts=10)\n", |
|
|
|
"test_filenames = save_to_csv(output_dir, test_data, \"test\",\n", |
|
|
|
" header_str, n_parts=10)" |
|
|
|
] |
|
|
|
}, |
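{ |
"cell_type": "markdown", |
"metadata": {}, |
"source": [ |
"As a quick sanity check (an addition to the original notebook), one of the generated shards can be read back with pandas; the column names come from the header row that save_to_csv wrote." |
] |
}, |
{ |
"cell_type": "code", |
"execution_count": null, |
"metadata": {}, |
"outputs": [], |
"source": [ |
"# Read the first training shard back; pd was imported at the top\n", |
"sample_df = pd.read_csv(train_filenames[0])\n", |
"sample_df.head()" |
] |
}, |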
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 12, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"temp_array=np.array([[1,2,3],[4,5,6]])\n", |
|
|
|
"np.savetxt(\"temp.csv\",temp_array) #savetxt会自动将整型数或者浮点数转为字符串存储" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 13, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"1.000000000000000000e+00 2.000000000000000000e+00 3.000000000000000000e+00\r\n", |
|
|
|
"4.000000000000000000e+00 5.000000000000000000e+00 6.000000000000000000e+00\r\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"!cat temp.csv" |
|
|
|
] |
|
|
|
}, |
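{ |
"cell_type": "markdown", |
"metadata": {}, |
"source": [ |
"The space-separated scientific notation above is np.savetxt's default (fmt='%.18e', space delimiter). A small sketch (temp2.csv is a throwaway name) showing how fmt and delimiter can be overridden to get a compact, comma-separated file:" |
] |
}, |
{ |
"cell_type": "code", |
"execution_count": null, |
"metadata": {}, |
"outputs": [], |
"source": [ |
"# Override savetxt's defaults: compact numbers, comma delimiter\n", |
"np.savetxt(\"temp2.csv\", temp_array, fmt=\"%g\", delimiter=\",\")\n", |
"!cat temp2.csv" |
] |
}, |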
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 14, |
|
|
|
"metadata": { |
|
|
|
"collapsed": true |
|
|
|
}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"['generate_csv/train_00.csv', 'generate_csv/train_01.csv', 'generate_csv/train_02.csv', 'generate_csv/train_03.csv', 'generate_csv/train_04.csv', 'generate_csv/train_05.csv', 'generate_csv/train_06.csv', 'generate_csv/train_07.csv', 'generate_csv/train_08.csv', 'generate_csv/train_09.csv', 'generate_csv/train_10.csv', 'generate_csv/train_11.csv', 'generate_csv/train_12.csv', 'generate_csv/train_13.csv', 'generate_csv/train_14.csv', 'generate_csv/train_15.csv', 'generate_csv/train_16.csv', 'generate_csv/train_17.csv', 'generate_csv/train_18.csv', 'generate_csv/train_19.csv']\n", |
|
|
|
"train filenames:\n", |
|
|
|
"['generate_csv/train_00.csv',\n", |
|
|
|
" 'generate_csv/train_01.csv',\n", |
|
|
|
" 'generate_csv/train_02.csv',\n", |
|
|
|
" 'generate_csv/train_03.csv',\n", |
|
|
|
" 'generate_csv/train_04.csv',\n", |
|
|
|
" 'generate_csv/train_05.csv',\n", |
|
|
|
" 'generate_csv/train_06.csv',\n", |
|
|
|
" 'generate_csv/train_07.csv',\n", |
|
|
|
" 'generate_csv/train_08.csv',\n", |
|
|
|
" 'generate_csv/train_09.csv',\n", |
|
|
|
" 'generate_csv/train_10.csv',\n", |
|
|
|
" 'generate_csv/train_11.csv',\n", |
|
|
|
" 'generate_csv/train_12.csv',\n", |
|
|
|
" 'generate_csv/train_13.csv',\n", |
|
|
|
" 'generate_csv/train_14.csv',\n", |
|
|
|
" 'generate_csv/train_15.csv',\n", |
|
|
|
" 'generate_csv/train_16.csv',\n", |
|
|
|
" 'generate_csv/train_17.csv',\n", |
|
|
|
" 'generate_csv/train_18.csv',\n", |
|
|
|
" 'generate_csv/train_19.csv']\n", |
|
|
|
"valid filenames:\n", |
|
|
|
"['generate_csv/valid_00.csv',\n", |
|
|
|
" 'generate_csv/valid_01.csv',\n", |
|
|
|
" 'generate_csv/valid_02.csv',\n", |
|
|
|
" 'generate_csv/valid_03.csv',\n", |
|
|
|
" 'generate_csv/valid_04.csv',\n", |
|
|
|
" 'generate_csv/valid_05.csv',\n", |
|
|
|
" 'generate_csv/valid_06.csv',\n", |
|
|
|
" 'generate_csv/valid_07.csv',\n", |
|
|
|
" 'generate_csv/valid_08.csv',\n", |
|
|
|
" 'generate_csv/valid_09.csv']\n", |
|
|
|
"test filenames:\n", |
|
|
|
"['generate_csv/test_00.csv',\n", |
|
|
|
" 'generate_csv/test_01.csv',\n", |
|
|
|
" 'generate_csv/test_02.csv',\n", |
|
|
|
" 'generate_csv/test_03.csv',\n", |
|
|
|
" 'generate_csv/test_04.csv',\n", |
|
|
|
" 'generate_csv/test_05.csv',\n", |
|
|
|
" 'generate_csv/test_06.csv',\n", |
|
|
|
" 'generate_csv/test_07.csv',\n", |
|
|
|
" 'generate_csv/test_08.csv',\n", |
|
|
|
" 'generate_csv/test_09.csv']\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"#看下生成文件的文件名\n", |
|
|
|
"print(train_filenames)\n", |
|
|
|
"import pprint #为了打印美观性\n", |
|
|
|
"print(\"train filenames:\")\n", |
|
|
|
"pprint.pprint(train_filenames)\n", |
|
|
|
"print(\"valid filenames:\")\n", |
|
|
|
"pprint.pprint(valid_filenames)\n", |
|
|
|
"print(\"test filenames:\")\n", |
|
|
|
"pprint.pprint(test_filenames)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 16, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"tf.Tensor(b'generate_csv/train_13.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_11.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_12.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_15.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_10.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_02.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_00.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_09.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_19.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_04.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_18.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_17.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_08.csv', shape=(), dtype=string)\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"# 1. filename -> dataset\n", |
|
|
|
"# 2. read file -> dataset -> datasets -> merge\n", |
|
|
|
"# 3. parse csv\n", |
|
|
|
"#list_files把文件名搞为一个dataset\n", |
|
|
|
"# list_files默认行为是按不确定的随机混排顺序返回文件名\n", |
|
|
|
"filename_dataset = tf.data.Dataset.list_files(train_filenames)\n", |
|
|
|
"for filename in filename_dataset:\n", |
|
|
|
" print(filename)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 17, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"tf.Tensor(b'generate_csv/train_00.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_02.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_04.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_08.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_09.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_10.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_11.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_12.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_13.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_15.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_17.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_18.csv', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'generate_csv/train_19.csv', shape=(), dtype=string)\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"filename_mydataset=tf.data.Dataset.from_tensor_slices(train_filenames)\n", |
|
|
|
"filename_mydataset=filename_mydataset.repeat(1)\n", |
|
|
|
"for i in filename_mydataset:\n", |
|
|
|
" print(i)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# 把数据从文件中拿出来" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 24, |
|
|
|
"metadata": { |
|
|
|
"scrolled": false |
|
|
|
}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"tf.Tensor(b'0.801544314532886,0.27216142415910205,-0.11624392696666119,-0.2023115137272354,-0.5430515742518128,-0.021039615516440048,-0.5897620622908205,-0.08241845654707416,3.226', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'-0.2980728090942217,0.3522616607867429,-0.10920507530549702,-0.25055520947444,-0.034064024638222286,-0.006034004264459185,1.080554840130013,-1.0611381656679573,1.514', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'-0.6906143291679195,-0.1283397589791022,7.0201810347470595,5.624287386169439,-0.2663292879200034,-0.03662080416157129,-0.6457503383496215,1.2058962626018372,1.352', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'0.401276648075221,-0.9293421252555106,-0.05333050451405854,-0.1865945262276826,0.6545661895448709,0.026434465728210874,0.9312527706398824,-1.4406417263474771,2.512', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'-0.8757754235423053,1.874166156711919,-0.9487499555702599,-0.09657184824705009,-0.7163432355284542,-0.07790191228558485,0.9825753570271144,-1.4206678547327694,2.75', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'0.15782311132800697,0.43236189741438374,0.3379948076652917,-0.015880306122244434,-0.3733890577139493,-0.05305245634489608,0.8006134598360177,-1.2359095422966828,3.169', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'2.2878417437355094,-1.8905449647872008,0.6607106467795992,-0.14964778023694128,-0.06672632728722275,0.44788055801575993,-0.5337737862320228,0.5667323709310584,3.59', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'-1.0591781535672364,1.393564736946074,-0.026331968874673636,-0.11006759528831847,-0.6138198966579805,-0.09695934953589447,0.3247131133362288,-0.037477245413977976,0.672', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'-0.2223565745313433,1.393564736946074,0.02991299565857307,0.0801452044790158,-0.509481985418118,-0.06238599304952824,-0.86503775291325,0.8613469772480595,2.0', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'-0.03058829290446139,-0.9293421252555106,0.2596214817762415,-0.00601274044096368,-0.5004091235711734,-0.030779867916061836,1.5984463936739026,-1.8151518191233238,1.598', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'1.9063832474401923,0.5124621340420246,0.44758280183798754,-0.276721775345798,-0.6310583341671753,-0.07081146722873086,-0.7064043040799849,0.7464972154634646,5.00001', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'0.29422955783115173,1.874166156711919,0.004626028663628252,-0.28479278487900694,-0.5602900117610076,-0.1196496378702887,1.3558305307524392,-0.9512818717870428,1.625', shape=(), dtype=string)\n", |
|
|
|
"tf.Tensor(b'0.7751155655229017,1.874166156711919,0.15645971958808144,-0.18905190538070707,-0.6292437617977863,-0.08791603438866835,-0.7483955111240856,0.5717258388347319,4.851', shape=(), dtype=string)\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"#一访问list_files的dataset对象就随机了文件顺序\n", |
|
|
|
"# for filename in filename_dataset:\n", |
|
|
|
"# print(filename)\n", |
|
|
|
"n_readers = 5\n", |
|
|
|
"dataset = filename_mydataset.interleave(\n", |
|
|
|
" #前面1行是header\n", |
|
|
|
"# lambda filename: tf.data.TextLineDataset(filename),\n", |
|
|
|
" #不带header,把特征名字去掉\n", |
|
|
|
" lambda filename: tf.data.TextLineDataset(filename).skip(1),\n", |
|
|
|
" cycle_length = n_readers, #cycle_length和block_length增加获取了数据的随机性\n", |
|
|
|
" block_length=2\n", |
|
|
|
")\n", |
|
|
|
"for line in dataset.take(15):\n", |
|
|
|
" print(line)" |
|
|
|
] |
|
|
|
}, |
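{ |
"cell_type": "markdown", |
"metadata": {}, |
"source": [ |
"To make cycle_length and block_length concrete, here is a toy interleave added for illustration: three tiny sources are visited round-robin, two elements at a time." |
] |
}, |
{ |
"cell_type": "code", |
"execution_count": null, |
"metadata": {}, |
"outputs": [], |
"source": [ |
"# Each element x becomes a small dataset of x repeated 4 times;\n", |
"# interleave cycles over 3 such sources, taking block_length=2 per visit.\n", |
"toy = tf.data.Dataset.range(3).interleave(\n", |
"    lambda x: tf.data.Dataset.from_tensors(x).repeat(4),\n", |
"    cycle_length=3,\n", |
"    block_length=2)\n", |
"print(list(toy.as_numpy_iterator()))\n", |
"# Expected: [0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2]" |
] |
}, |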
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# 把每一行数据切分为对应类型" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 18, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"[<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: shape=(), dtype=float32, numpy=5.0>]\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"#parse csv 解析csv,通过decode_csv\n", |
|
|
|
"# tf.io.decode_csv(str, record_defaults)\n", |
|
|
|
"\n", |
|
|
|
"sample_str = '1,2,3,4,5'\n", |
|
|
|
"record_defaults = [\n", |
|
|
|
" tf.constant(0, dtype=tf.int32),\n", |
|
|
|
" 0,\n", |
|
|
|
" np.nan,\n", |
|
|
|
" \"hello1\",\n", |
|
|
|
" tf.constant([])#没有固定类型,默认是float32\n", |
|
|
|
"]\n", |
|
|
|
"#sample_str数据格式化,按照record_defaults进行处理\n", |
|
|
|
"parsed_fields = tf.io.decode_csv(sample_str, record_defaults)\n", |
|
|
|
"print(parsed_fields)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 19, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"[<tf.Tensor: shape=(), dtype=int32, numpy=0>,\n", |
|
|
|
" <tf.Tensor: shape=(), dtype=int32, numpy=0>,\n", |
|
|
|
" <tf.Tensor: shape=(), dtype=float32, numpy=nan>,\n", |
|
|
|
" <tf.Tensor: shape=(), dtype=string, numpy=b'hello1'>,\n", |
|
|
|
" <tf.Tensor: shape=(), dtype=float32, numpy=1.0>]" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 19, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"#我们传一个空的字符串测试\n", |
|
|
|
"#最后一个为1是可以转换的\n", |
|
|
|
"try:\n", |
|
|
|
" parsed_fields = tf.io.decode_csv(',,,,1', record_defaults)\n", |
|
|
|
"except tf.errors.InvalidArgumentError as ex:\n", |
|
|
|
" print(ex)\n", |
|
|
|
"parsed_fields" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 20, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"#我们给的值过多的情况\n", |
|
|
|
"try:\n", |
|
|
|
" parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,', record_defaults)\n", |
|
|
|
"except tf.errors.InvalidArgumentError as ex:\n", |
|
|
|
" print(ex)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 21, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"(<tf.Tensor: shape=(8,), dtype=float32, numpy=\n", |
|
|
|
" array([-0.9868721 , 0.8328631 , -0.18684709, -0.1488895 , -0.45323023,\n", |
|
|
|
" -0.11504996, 1.6730974 , -0.74654967], dtype=float32)>,\n", |
|
|
|
" <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.138], dtype=float32)>)" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 21, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"#解析一行\n", |
|
|
|
"def parse_csv_line(line, n_fields = 9):\n", |
|
|
|
" #先写一个默认的格式,就是9个nan,如果从csv中读取缺失数据,就会变为nan\n", |
|
|
|
" defs = [tf.constant(np.nan)] * n_fields\n", |
|
|
|
" #使用decode_csv解析\n", |
|
|
|
" parsed_fields = tf.io.decode_csv(line, record_defaults=defs)\n", |
|
|
|
" #前8个是x,最后一个是y\n", |
|
|
|
" x = tf.stack(parsed_fields[0:-1])\n", |
|
|
|
" y = tf.stack(parsed_fields[-1:])\n", |
|
|
|
" return x, y\n", |
|
|
|
"\n", |
|
|
|
"parse_csv_line(b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138',\n", |
|
|
|
" n_fields=9)" |
|
|
|
] |
|
|
|
}, |
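{ |
"cell_type": "markdown", |
"metadata": {}, |
"source": [ |
"Because every default in defs is NaN, a record with a missing field parses to NaN instead of raising an error. A quick check with a made-up line whose third field is empty:" |
] |
}, |
{ |
"cell_type": "code", |
"execution_count": null, |
"metadata": {}, |
"outputs": [], |
"source": [ |
"# Missing fields fall back to the NaN defaults in parse_csv_line\n", |
"x_miss, y_miss = parse_csv_line(b'1.0,2.0,,4.0,5.0,6.0,7.0,8.0,1.5')\n", |
"print(x_miss)  # the third feature comes out as nan\n", |
"print(y_miss)" |
] |
}, |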
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 22, |
|
|
|
"metadata": { |
|
|
|
"collapsed": true |
|
|
|
}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"<BatchDataset shapes: ((None, 8), (None, 1)), types: (tf.float32, tf.float32)>\n", |
|
|
|
"--------------------------------------------------\n", |
|
|
|
"x:\n", |
|
|
|
"<tf.Tensor: shape=(4, 8), dtype=float32, numpy=\n", |
|
|
|
"array([[ 0.15782312, 0.4323619 , 0.3379948 , -0.01588031, -0.37338907,\n", |
|
|
|
" -0.05305246, 0.80061346, -1.2359096 ],\n", |
|
|
|
" [-1.0591781 , 1.3935647 , -0.02633197, -0.1100676 , -0.6138199 ,\n", |
|
|
|
" -0.09695935, 0.3247131 , -0.03747724],\n", |
|
|
|
" [-0.82195884, 1.8741661 , 0.1821235 , -0.03170019, -0.6011179 ,\n", |
|
|
|
" -0.14337493, 1.0852206 , -0.8613995 ],\n", |
|
|
|
" [ 0.63034356, 1.8741661 , -0.06713215, -0.12543367, -0.19737554,\n", |
|
|
|
" -0.02272263, -0.69240725, 0.72652334]], dtype=float32)>\n", |
|
|
|
"y:\n", |
|
|
|
"<tf.Tensor: shape=(4, 1), dtype=float32, numpy=\n", |
|
|
|
"array([[3.169],\n", |
|
|
|
" [0.672],\n", |
|
|
|
" [1.054],\n", |
|
|
|
" [2.419]], dtype=float32)>\n", |
|
|
|
"x:\n", |
|
|
|
"<tf.Tensor: shape=(4, 8), dtype=float32, numpy=\n", |
|
|
|
"array([[ 0.48530516, -0.8492419 , -0.06530126, -0.02337966, 1.4974351 ,\n", |
|
|
|
" -0.07790658, -0.90236324, 0.78145146],\n", |
|
|
|
" [ 2.2878418 , -1.890545 , 0.66071063, -0.14964779, -0.06672633,\n", |
|
|
|
" 0.44788057, -0.5337738 , 0.56673235],\n", |
|
|
|
" [-0.22235657, 1.3935647 , 0.029913 , 0.0801452 , -0.50948197,\n", |
|
|
|
" -0.06238599, -0.86503774, 0.86134696],\n", |
|
|
|
" [-0.46794146, -0.92934215, 0.11909926, -0.06047011, 0.30344644,\n", |
|
|
|
" -0.02185189, 1.8737221 , -1.0411643 ]], dtype=float32)>\n", |
|
|
|
"y:\n", |
|
|
|
"<tf.Tensor: shape=(4, 1), dtype=float32, numpy=\n", |
|
|
|
"array([[2.956],\n", |
|
|
|
" [3.59 ],\n", |
|
|
|
" [2. ],\n", |
|
|
|
" [1.012]], dtype=float32)>\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"# 1. filename -> dataset\n", |
|
|
|
"# 2. read file -> dataset -> datasets -> merge\n", |
|
|
|
"# 3. parse csv\n", |
|
|
|
"#完成整个流程\n", |
|
|
|
"def csv_reader_dataset(filenames, n_readers=5,\n", |
|
|
|
" batch_size=32, n_parse_threads=5,\n", |
|
|
|
" shuffle_buffer_size=10000):\n", |
|
|
|
" #把文件名类别变为dataset tensor\n", |
|
|
|
" dataset = tf.data.Dataset.list_files(filenames)\n", |
|
|
|
" #变为repeat dataset可以让读到最后一个样本时,从新去读第一个样本\n", |
|
|
|
" dataset = dataset.repeat()\n", |
|
|
|
" dataset = dataset.interleave(\n", |
|
|
|
" #skip(1)是因为每个文件存了特征名字,target名字\n", |
|
|
|
" lambda filename: tf.data.TextLineDataset(filename).skip(1),\n", |
|
|
|
" cycle_length = n_readers\n", |
|
|
|
" )\n", |
|
|
|
" dataset.shuffle(shuffle_buffer_size) #对数据进行洗牌,混乱\n", |
|
|
|
" #map,通过parse_csv_line对数据集进行映射,map只会给函数传递一个参数,这个参数\n", |
|
|
|
" #就是dataset中的tensor\n", |
|
|
|
" dataset = dataset.map(parse_csv_line,\n", |
|
|
|
" num_parallel_calls=n_parse_threads)\n", |
|
|
|
" dataset = dataset.batch(batch_size)\n", |
|
|
|
" return dataset\n", |
|
|
|
"#这里是一个测试,写4是为了大家理解\n", |
|
|
|
"train_set = csv_reader_dataset(train_filenames, batch_size=4)\n", |
|
|
|
"print(train_set)\n", |
|
|
|
"print('-'*50)\n", |
|
|
|
"i=0\n", |
|
|
|
"#是csv_reader_dataset处理后的结果,\n", |
|
|
|
"for x_batch, y_batch in train_set.take(2):\n", |
|
|
|
"# i=i+1\n", |
|
|
|
" print(\"x:\")\n", |
|
|
|
" pprint.pprint(x_batch)\n", |
|
|
|
" print(\"y:\")\n", |
|
|
|
" pprint.pprint(y_batch)\n", |
|
|
|
"# print(i)" |
|
|
|
] |
|
|
|
}, |
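{ |
"cell_type": "markdown", |
"metadata": {}, |
"source": [ |
"One optional tweak not present in csv_reader_dataset above: chaining .prefetch lets the pipeline prepare the next batch while the current one is being consumed." |
] |
}, |
{ |
"cell_type": "code", |
"execution_count": null, |
"metadata": {}, |
"outputs": [], |
"source": [ |
"# Sketch: the same pipeline plus prefetch(1) to overlap I/O with compute\n", |
"train_set_pf = csv_reader_dataset(train_filenames, batch_size=4).prefetch(1)\n", |
"for x_batch, y_batch in train_set_pf.take(1):\n", |
"    print(x_batch.shape, y_batch.shape)" |
] |
}, |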
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 23, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"CPU times: user 137 ms, sys: 40.3 ms, total: 177 ms\n", |
|
|
|
"Wall time: 160 ms\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"%%time\n", |
|
|
|
"batch_size = 32\n", |
|
|
|
"train_set = csv_reader_dataset(train_filenames,\n", |
|
|
|
" batch_size = batch_size)\n", |
|
|
|
"valid_set = csv_reader_dataset(valid_filenames,\n", |
|
|
|
" batch_size = batch_size)\n", |
|
|
|
"test_set = csv_reader_dataset(test_filenames,\n", |
|
|
|
" batch_size = batch_size)\n", |
|
|
|
"\n", |
|
|
|
"# print(train_set)\n", |
|
|
|
"# print(valid_set)\n", |
|
|
|
"# print(test_set)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 24, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"Epoch 1/100\n", |
|
|
|
"348/348 [==============================] - 1s 3ms/step - loss: 1.1306 - val_loss: 0.9811\n", |
|
|
|
"Epoch 2/100\n", |
|
|
|
"348/348 [==============================] - 1s 3ms/step - loss: 2.4388 - val_loss: 0.5692\n", |
|
|
|
"Epoch 3/100\n", |
|
|
|
"348/348 [==============================] - 1s 3ms/step - loss: 0.5545 - val_loss: 0.6181\n", |
|
|
|
"Epoch 4/100\n", |
|
|
|
"348/348 [==============================] - 1s 4ms/step - loss: 0.6097 - val_loss: 0.4497\n", |
|
|
|
"Epoch 5/100\n", |
|
|
|
"348/348 [==============================] - 1s 3ms/step - loss: 0.4277 - val_loss: 0.4555\n", |
|
|
|
"Epoch 6/100\n", |
|
|
|
"348/348 [==============================] - 1s 4ms/step - loss: 0.3998 - val_loss: 0.3870\n", |
|
|
|
"Epoch 7/100\n", |
|
|
|
"348/348 [==============================] - 1s 4ms/step - loss: 0.3889 - val_loss: 0.4119\n", |
|
|
|
"Epoch 8/100\n", |
|
|
|
"348/348 [==============================] - 1s 3ms/step - loss: 0.3831 - val_loss: 0.3941\n", |
|
|
|
"Epoch 9/100\n", |
|
|
|
"348/348 [==============================] - 1s 3ms/step - loss: 0.3870 - val_loss: 0.4068\n", |
|
|
|
"Epoch 10/100\n", |
|
|
|
"348/348 [==============================] - 1s 3ms/step - loss: 0.3689 - val_loss: 0.3801\n", |
|
|
|
"Epoch 11/100\n", |
|
|
|
"348/348 [==============================] - 1s 3ms/step - loss: 0.3804 - val_loss: 0.3957\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"#我们知道长度为8\n", |
|
|
|
"model = keras.models.Sequential([\n", |
|
|
|
" keras.layers.Dense(30, activation='relu',\n", |
|
|
|
" input_shape=[8]),\n", |
|
|
|
" keras.layers.Dense(1),\n", |
|
|
|
"])\n", |
|
|
|
"model.compile(loss=\"mean_squared_error\", optimizer=\"sgd\")\n", |
|
|
|
"callbacks = [keras.callbacks.EarlyStopping(\n", |
|
|
|
" patience=5, min_delta=1e-2)]\n", |
|
|
|
"\n", |
|
|
|
"#当是BatchDataset,必须制定steps_per_epoch,validation_steps\n", |
|
|
|
"history = model.fit(train_set,\n", |
|
|
|
" validation_data = valid_set,\n", |
|
|
|
" steps_per_epoch = 11160 // batch_size, #每epoch训练的步数\n", |
|
|
|
" validation_steps = 3870 // batch_size,\n", |
|
|
|
" epochs = 100,\n", |
|
|
|
" callbacks = callbacks)" |
|
|
|
] |
|
|
|
}, |
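{ |
"cell_type": "markdown", |
"metadata": {}, |
"source": [ |
"A common follow-up, added here as a sketch (plot_learning_curves is not defined earlier in this notebook): plot the training and validation loss recorded in history." |
] |
}, |
{ |
"cell_type": "code", |
"execution_count": null, |
"metadata": {}, |
"outputs": [], |
"source": [ |
"# Plot train/validation loss per epoch; pd and plt were imported above\n", |
"def plot_learning_curves(history):\n", |
"    pd.DataFrame(history.history).plot(figsize=(8, 5))\n", |
"    plt.grid(True)\n", |
"    plt.gca().set_ylim(0, 2)\n", |
"    plt.show()\n", |
"\n", |
"plot_learning_curves(history)" |
] |
}, |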
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 25, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"161/161 [==============================] - 0s 2ms/step - loss: 0.3995\n" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"0.39946985244750977" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 25, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"model.evaluate(test_set, steps = 5160 // batch_size)" |
|
|
|
] |
|
|
|
}, |
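{ |
"cell_type": "markdown", |
"metadata": {}, |
"source": [ |
"The same bounded-steps idea applies to inference (a sketch added for completeness): because test_set repeats forever, predict also needs an explicit steps argument." |
] |
}, |
{ |
"cell_type": "code", |
"execution_count": null, |
"metadata": {}, |
"outputs": [], |
"source": [ |
"# Predict on the repeating test_set; steps bounds the infinite dataset\n", |
"predictions = model.predict(test_set, steps=5160 // batch_size)\n", |
"print(predictions.shape)\n", |
"print(predictions[:3])" |
] |
}, |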
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 37, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"[<tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 1, 2, 3])>,\n", |
|
|
|
" <tf.Tensor: shape=(4,), dtype=int64, numpy=array([4, 5, 6, 7])>]" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 37, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"dataset = tf.data.Dataset.range(8)\n", |
|
|
|
"dataset = dataset.batch(4) #把tensor组合到一起,就是分了batch\n", |
|
|
|
"list(dataset)" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"metadata": { |
|
|
|
"kernelspec": { |
|
|
|
"display_name": "Python 3", |
|
|
|
"language": "python", |
|
|
|
"name": "python3" |
|
|
|
}, |
|
|
|
"language_info": { |
|
|
|
"codemirror_mode": { |
|
|
|
"name": "ipython", |
|
|
|
"version": 3 |
|
|
|
}, |
|
|
|
"file_extension": ".py", |
|
|
|
"mimetype": "text/x-python", |
|
|
|
"name": "python", |
|
|
|
"nbconvert_exporter": "python", |
|
|
|
"pygments_lexer": "ipython3", |
|
|
|
"version": "3.6.9" |
|
|
|
} |
|
|
|
}, |
|
|
|
"nbformat": 4, |
|
|
|
"nbformat_minor": 2 |
|
|
|
} |