diff --git a/tf02_data_generate_csv.ipynb b/tf02_data_generate_csv.ipynb new file mode 100644 index 0000000..ad4d35b --- /dev/null +++ b/tf02_data_generate_csv.ipynb @@ -0,0 +1,853 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.2.0\n", + "sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)\n", + "matplotlib 3.3.4\n", + "numpy 1.19.5\n", + "pandas 1.1.5\n", + "sklearn 0.24.2\n", + "tensorflow 2.2.0\n", + "tensorflow.keras 2.3.0-tf\n" + ] + } + ], + "source": [ + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import numpy as np\n", + "import sklearn\n", + "import pandas as pd\n", + "import os\n", + "import sys\n", + "import time\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow import keras\n", + "\n", + "print(tf.__version__)\n", + "print(sys.version_info)\n", + "for module in mpl, np, pd, sklearn, tf, keras:\n", + " print(module.__name__, module.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_california_housing\n", + "\n", + "housing = fetch_california_housing()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(11610, 8) (11610,)\n", + "(3870, 8) (3870,)\n", + "(5160, 8) (5160,)\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "x_train_all, x_test, y_train_all, y_test = train_test_split(\n", + " housing.data, housing.target, random_state = 7)\n", + "x_train, x_valid, y_train, y_valid = train_test_split(\n", + " x_train_all, y_train_all, random_state = 11)\n", + "print(x_train.shape, y_train.shape)\n", + "print(x_valid.shape, y_valid.shape)\n", + "print(x_test.shape, y_test.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler()\n", + "x_train_scaled = scaler.fit_transform(x_train)\n", + "x_valid_scaled = scaler.transform(x_valid)\n", + "x_test_scaled = scaler.transform(x_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf generate_csv" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf01-dataset_basic_api.ipynb tf03-tfrecord_basic_api.ipynb\r\n", + "tf02_data_generate_csv.ipynb tf04_data_generate_tfrecord.ipynb\r\n" + ] + } + ], + "source": [ + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "numpy.ndarray" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(x_train_scaled)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['.ipynb_checkpoints',\n", + " 'tf02_data_generate_csv.ipynb',\n", + " 'tf04_data_generate_tfrecord.ipynb',\n", + " 'tf03-tfrecord_basic_api.ipynb',\n", + " 'tf01-dataset_basic_api.ipynb']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "os.listdir()" + ] + }, + { + 
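"cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick optional sanity check: a fitted `StandardScaler` exposes the statistics it learned on the training set as `mean_` and `scale_`, so the transform above should be exactly `z = (x - mean_) / scale_`. The sketch below assumes `x_train`, `x_train_scaled` and `scaler` from the cells above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# optional sanity check (sketch): StandardScaler's transform should equal (x - mean_) / scale_\n",
+ "assert np.allclose(x_train_scaled, (x_train - scaler.mean_) / scaler.scale_)\n",
+ "print(scaler.mean_.shape, scaler.scale_.shape)"
+ ]
+ },
+ {
+ 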
"cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 [0 1 2 3 4]\n", + "1 [5 6 7 8 9]\n", + "2 [10 11 12 13 14]\n", + "3 [15 16 17 18 19]\n" + ] + } + ], + "source": [ + "#为了把数据分好\n", + "for file_idx, row_indices in enumerate(np.array_split(np.arange(20), 4)):\n", + " print(file_idx,row_indices)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "#下面要把特征工程后的数据存为csv文件\n", + "output_dir = \"generate_csv\"\n", + "if not os.path.exists(output_dir):\n", + " os.mkdir(output_dir)\n", + "\n", + "#save_to_csv是工作可以直接复用的\n", + "def save_to_csv(output_dir, data, name_prefix,\n", + " header=None, n_parts=10):\n", + " #生成文件名 格式generate_csv/{}_{:02d}.csv\n", + " path_format = os.path.join(output_dir, \"{}_{:02d}.csv\") \n", + " filenames = []\n", + " #把数据分为n_parts部分,写到文件中去\n", + " for file_idx, row_indices in enumerate(\n", + " np.array_split(np.arange(len(data)), n_parts)):\n", + " #print(file_idx,row_indices)\n", + " #生成子文件名\n", + " part_csv = path_format.format(name_prefix, file_idx)\n", + " filenames.append(part_csv) #文件名添加到列表\n", + " with open(part_csv, \"w\", encoding=\"utf-8\") as f:\n", + " #先写头部\n", + " if header is not None:\n", + " f.write(header + \"\\n\")\n", + " for row_index in row_indices:\n", + " #把字符串化后的每个字符串用逗号拼接起来\n", + " f.write(\",\".join(\n", + " [repr(col) for col in data[row_index]]))\n", + " f.write('\\n')\n", + " return filenames\n", + "#np.c_把x和y合并起来,按轴1合并\n", + "train_data = np.c_[x_train_scaled, y_train]\n", + "valid_data = np.c_[x_valid_scaled, y_valid]\n", + "test_data = np.c_[x_test_scaled, y_test]\n", + "#头部,特征,也有目标\n", + "header_cols = housing.feature_names + [\"MidianHouseValue\"]\n", + "#把列表变为字符串\n", + "header_str = \",\".join(header_cols)\n", + "print(header_str)\n", + "print('-'*50)\n", + "train_filenames = save_to_csv(output_dir, train_data, \"train\",\n", + " header_str, n_parts=20)\n", + "valid_filenames = save_to_csv(output_dir, valid_data, \"valid\",\n", + " header_str, n_parts=10)\n", + "test_filenames = save_to_csv(output_dir, test_data, \"test\",\n", + " header_str, n_parts=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "temp_array=np.array([[1,2,3],[4,5,6]])\n", + "np.savetxt(\"temp.csv\",temp_array) #savetxt会自动将整型数或者浮点数转为字符串存储" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.000000000000000000e+00 2.000000000000000000e+00 3.000000000000000000e+00\r\n", + "4.000000000000000000e+00 5.000000000000000000e+00 6.000000000000000000e+00\r\n" + ] + } + ], + "source": [ + "!cat temp.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['generate_csv/train_00.csv', 'generate_csv/train_01.csv', 'generate_csv/train_02.csv', 'generate_csv/train_03.csv', 'generate_csv/train_04.csv', 'generate_csv/train_05.csv', 'generate_csv/train_06.csv', 'generate_csv/train_07.csv', 'generate_csv/train_08.csv', 'generate_csv/train_09.csv', 'generate_csv/train_10.csv', 
'generate_csv/train_11.csv', 'generate_csv/train_12.csv', 'generate_csv/train_13.csv', 'generate_csv/train_14.csv', 'generate_csv/train_15.csv', 'generate_csv/train_16.csv', 'generate_csv/train_17.csv', 'generate_csv/train_18.csv', 'generate_csv/train_19.csv']\n",
+ "train filenames:\n",
+ "['generate_csv/train_00.csv',\n",
+ " 'generate_csv/train_01.csv',\n",
+ " 'generate_csv/train_02.csv',\n",
+ " 'generate_csv/train_03.csv',\n",
+ " 'generate_csv/train_04.csv',\n",
+ " 'generate_csv/train_05.csv',\n",
+ " 'generate_csv/train_06.csv',\n",
+ " 'generate_csv/train_07.csv',\n",
+ " 'generate_csv/train_08.csv',\n",
+ " 'generate_csv/train_09.csv',\n",
+ " 'generate_csv/train_10.csv',\n",
+ " 'generate_csv/train_11.csv',\n",
+ " 'generate_csv/train_12.csv',\n",
+ " 'generate_csv/train_13.csv',\n",
+ " 'generate_csv/train_14.csv',\n",
+ " 'generate_csv/train_15.csv',\n",
+ " 'generate_csv/train_16.csv',\n",
+ " 'generate_csv/train_17.csv',\n",
+ " 'generate_csv/train_18.csv',\n",
+ " 'generate_csv/train_19.csv']\n",
+ "valid filenames:\n",
+ "['generate_csv/valid_00.csv',\n",
+ " 'generate_csv/valid_01.csv',\n",
+ " 'generate_csv/valid_02.csv',\n",
+ " 'generate_csv/valid_03.csv',\n",
+ " 'generate_csv/valid_04.csv',\n",
+ " 'generate_csv/valid_05.csv',\n",
+ " 'generate_csv/valid_06.csv',\n",
+ " 'generate_csv/valid_07.csv',\n",
+ " 'generate_csv/valid_08.csv',\n",
+ " 'generate_csv/valid_09.csv']\n",
+ "test filenames:\n",
+ "['generate_csv/test_00.csv',\n",
+ " 'generate_csv/test_01.csv',\n",
+ " 'generate_csv/test_02.csv',\n",
+ " 'generate_csv/test_03.csv',\n",
+ " 'generate_csv/test_04.csv',\n",
+ " 'generate_csv/test_05.csv',\n",
+ " 'generate_csv/test_06.csv',\n",
+ " 'generate_csv/test_07.csv',\n",
+ " 'generate_csv/test_08.csv',\n",
+ " 'generate_csv/test_09.csv']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# inspect the generated filenames\n",
+ "print(train_filenames)\n",
+ "import pprint # pprint just makes the output easier to read\n",
+ "print(\"train filenames:\")\n",
+ "pprint.pprint(train_filenames)\n",
+ "print(\"valid filenames:\")\n",
+ "pprint.pprint(valid_filenames)\n",
+ "print(\"test filenames:\")\n",
+ "pprint.pprint(test_filenames)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "tf.Tensor(b'generate_csv/train_13.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_11.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_12.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_15.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_10.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_02.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_00.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_09.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_19.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_04.csv', shape=(), dtype=string)\n",
+ "tf.Tensor(b'generate_csv/train_18.csv', shape=(), 
dtype=string)\n", + "tf.Tensor(b'generate_csv/train_17.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_08.csv', shape=(), dtype=string)\n" + ] + } + ], + "source": [ + "# 1. filename -> dataset\n", + "# 2. read file -> dataset -> datasets -> merge\n", + "# 3. parse csv\n", + "#list_files把文件名搞为一个dataset\n", + "# list_files默认行为是按不确定的随机混排顺序返回文件名\n", + "filename_dataset = tf.data.Dataset.list_files(train_filenames)\n", + "for filename in filename_dataset:\n", + " print(filename)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf.Tensor(b'generate_csv/train_00.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_02.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_04.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_08.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_09.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_10.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_11.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_12.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_13.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_15.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_17.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_18.csv', shape=(), dtype=string)\n", + "tf.Tensor(b'generate_csv/train_19.csv', shape=(), dtype=string)\n" + ] + } + ], + "source": [ + "filename_mydataset=tf.data.Dataset.from_tensor_slices(train_filenames)\n", + "filename_mydataset=filename_mydataset.repeat(1)\n", + "for i in filename_mydataset:\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 把数据从文件中拿出来" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf.Tensor(b'0.801544314532886,0.27216142415910205,-0.11624392696666119,-0.2023115137272354,-0.5430515742518128,-0.021039615516440048,-0.5897620622908205,-0.08241845654707416,3.226', shape=(), dtype=string)\n", + "tf.Tensor(b'-0.2980728090942217,0.3522616607867429,-0.10920507530549702,-0.25055520947444,-0.034064024638222286,-0.006034004264459185,1.080554840130013,-1.0611381656679573,1.514', shape=(), dtype=string)\n", + "tf.Tensor(b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147', shape=(), dtype=string)\n", + "tf.Tensor(b'-0.6906143291679195,-0.1283397589791022,7.0201810347470595,5.624287386169439,-0.2663292879200034,-0.03662080416157129,-0.6457503383496215,1.2058962626018372,1.352', shape=(), dtype=string)\n", + 
"tf.Tensor(b'0.401276648075221,-0.9293421252555106,-0.05333050451405854,-0.1865945262276826,0.6545661895448709,0.026434465728210874,0.9312527706398824,-1.4406417263474771,2.512', shape=(), dtype=string)\n", + "tf.Tensor(b'-0.8757754235423053,1.874166156711919,-0.9487499555702599,-0.09657184824705009,-0.7163432355284542,-0.07790191228558485,0.9825753570271144,-1.4206678547327694,2.75', shape=(), dtype=string)\n", + "tf.Tensor(b'0.15782311132800697,0.43236189741438374,0.3379948076652917,-0.015880306122244434,-0.3733890577139493,-0.05305245634489608,0.8006134598360177,-1.2359095422966828,3.169', shape=(), dtype=string)\n", + "tf.Tensor(b'2.2878417437355094,-1.8905449647872008,0.6607106467795992,-0.14964778023694128,-0.06672632728722275,0.44788055801575993,-0.5337737862320228,0.5667323709310584,3.59', shape=(), dtype=string)\n", + "tf.Tensor(b'-1.0591781535672364,1.393564736946074,-0.026331968874673636,-0.11006759528831847,-0.6138198966579805,-0.09695934953589447,0.3247131133362288,-0.037477245413977976,0.672', shape=(), dtype=string)\n", + "tf.Tensor(b'-0.2223565745313433,1.393564736946074,0.02991299565857307,0.0801452044790158,-0.509481985418118,-0.06238599304952824,-0.86503775291325,0.8613469772480595,2.0', shape=(), dtype=string)\n", + "tf.Tensor(b'-0.03058829290446139,-0.9293421252555106,0.2596214817762415,-0.00601274044096368,-0.5004091235711734,-0.030779867916061836,1.5984463936739026,-1.8151518191233238,1.598', shape=(), dtype=string)\n", + "tf.Tensor(b'1.9063832474401923,0.5124621340420246,0.44758280183798754,-0.276721775345798,-0.6310583341671753,-0.07081146722873086,-0.7064043040799849,0.7464972154634646,5.00001', shape=(), dtype=string)\n", + "tf.Tensor(b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138', shape=(), dtype=string)\n", + "tf.Tensor(b'0.29422955783115173,1.874166156711919,0.004626028663628252,-0.28479278487900694,-0.5602900117610076,-0.1196496378702887,1.3558305307524392,-0.9512818717870428,1.625', shape=(), dtype=string)\n", + "tf.Tensor(b'0.7751155655229017,1.874166156711919,0.15645971958808144,-0.18905190538070707,-0.6292437617977863,-0.08791603438866835,-0.7483955111240856,0.5717258388347319,4.851', shape=(), dtype=string)\n" + ] + } + ], + "source": [ + "#一访问list_files的dataset对象就随机了文件顺序\n", + "# for filename in filename_dataset:\n", + "# print(filename)\n", + "n_readers = 5\n", + "dataset = filename_mydataset.interleave(\n", + " #前面1行是header\n", + "# lambda filename: tf.data.TextLineDataset(filename),\n", + " #不带header,把特征名字去掉\n", + " lambda filename: tf.data.TextLineDataset(filename).skip(1),\n", + " cycle_length = n_readers, #cycle_length和block_length增加获取了数据的随机性\n", + " block_length=2\n", + ")\n", + "for line in dataset.take(15):\n", + " print(line)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 把每一行数据切分为对应类型" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[, , , , ]\n" + ] + } + ], + "source": [ + "#parse csv 解析csv,通过decode_csv\n", + "# tf.io.decode_csv(str, record_defaults)\n", + "\n", + "sample_str = '1,2,3,4,5'\n", + "record_defaults = [\n", + " tf.constant(0, dtype=tf.int32),\n", + " 0,\n", + " np.nan,\n", + " \"hello1\",\n", + " tf.constant([])#没有固定类型,默认是float32\n", + "]\n", + "#sample_str数据格式化,按照record_defaults进行处理\n", + "parsed_fields = 
tf.io.decode_csv(sample_str, record_defaults)\n", + "print(parsed_fields)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#我们传一个空的字符串测试\n", + "#最后一个为1是可以转换的\n", + "try:\n", + " parsed_fields = tf.io.decode_csv(',,,,1', record_defaults)\n", + "except tf.errors.InvalidArgumentError as ex:\n", + " print(ex)\n", + "parsed_fields" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]\n" + ] + } + ], + "source": [ + "#我们给的值过多的情况\n", + "try:\n", + " parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,', record_defaults)\n", + "except tf.errors.InvalidArgumentError as ex:\n", + " print(ex)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(,\n", + " )" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#解析一行\n", + "def parse_csv_line(line, n_fields = 9):\n", + " #先写一个默认的格式,就是9个nan,如果从csv中读取缺失数据,就会变为nan\n", + " defs = [tf.constant(np.nan)] * n_fields\n", + " #使用decode_csv解析\n", + " parsed_fields = tf.io.decode_csv(line, record_defaults=defs)\n", + " #前8个是x,最后一个是y\n", + " x = tf.stack(parsed_fields[0:-1])\n", + " y = tf.stack(parsed_fields[-1:])\n", + " return x, y\n", + "\n", + "parse_csv_line(b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138',\n", + " n_fields=9)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--------------------------------------------------\n", + "x:\n", + "\n", + "y:\n", + "\n", + "x:\n", + "\n", + "y:\n", + "\n" + ] + } + ], + "source": [ + "# 1. filename -> dataset\n", + "# 2. read file -> dataset -> datasets -> merge\n", + "# 3. 
parse csv\n", + "#完成整个流程\n", + "def csv_reader_dataset(filenames, n_readers=5,\n", + " batch_size=32, n_parse_threads=5,\n", + " shuffle_buffer_size=10000):\n", + " #把文件名类别变为dataset tensor\n", + " dataset = tf.data.Dataset.list_files(filenames)\n", + " #变为repeat dataset可以让读到最后一个样本时,从新去读第一个样本\n", + " dataset = dataset.repeat()\n", + " dataset = dataset.interleave(\n", + " #skip(1)是因为每个文件存了特征名字,target名字\n", + " lambda filename: tf.data.TextLineDataset(filename).skip(1),\n", + " cycle_length = n_readers\n", + " )\n", + " dataset.shuffle(shuffle_buffer_size) #对数据进行洗牌,混乱\n", + " #map,通过parse_csv_line对数据集进行映射,map只会给函数传递一个参数,这个参数\n", + " #就是dataset中的tensor\n", + " dataset = dataset.map(parse_csv_line,\n", + " num_parallel_calls=n_parse_threads)\n", + " dataset = dataset.batch(batch_size)\n", + " return dataset\n", + "#这里是一个测试,写4是为了大家理解\n", + "train_set = csv_reader_dataset(train_filenames, batch_size=4)\n", + "print(train_set)\n", + "print('-'*50)\n", + "i=0\n", + "#是csv_reader_dataset处理后的结果,\n", + "for x_batch, y_batch in train_set.take(2):\n", + "# i=i+1\n", + " print(\"x:\")\n", + " pprint.pprint(x_batch)\n", + " print(\"y:\")\n", + " pprint.pprint(y_batch)\n", + "# print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 137 ms, sys: 40.3 ms, total: 177 ms\n", + "Wall time: 160 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "batch_size = 32\n", + "train_set = csv_reader_dataset(train_filenames,\n", + " batch_size = batch_size)\n", + "valid_set = csv_reader_dataset(valid_filenames,\n", + " batch_size = batch_size)\n", + "test_set = csv_reader_dataset(test_filenames,\n", + " batch_size = batch_size)\n", + "\n", + "# print(train_set)\n", + "# print(valid_set)\n", + "# print(test_set)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/100\n", + "348/348 [==============================] - 1s 3ms/step - loss: 1.1306 - val_loss: 0.9811\n", + "Epoch 2/100\n", + "348/348 [==============================] - 1s 3ms/step - loss: 2.4388 - val_loss: 0.5692\n", + "Epoch 3/100\n", + "348/348 [==============================] - 1s 3ms/step - loss: 0.5545 - val_loss: 0.6181\n", + "Epoch 4/100\n", + "348/348 [==============================] - 1s 4ms/step - loss: 0.6097 - val_loss: 0.4497\n", + "Epoch 5/100\n", + "348/348 [==============================] - 1s 3ms/step - loss: 0.4277 - val_loss: 0.4555\n", + "Epoch 6/100\n", + "348/348 [==============================] - 1s 4ms/step - loss: 0.3998 - val_loss: 0.3870\n", + "Epoch 7/100\n", + "348/348 [==============================] - 1s 4ms/step - loss: 0.3889 - val_loss: 0.4119\n", + "Epoch 8/100\n", + "348/348 [==============================] - 1s 3ms/step - loss: 0.3831 - val_loss: 0.3941\n", + "Epoch 9/100\n", + "348/348 [==============================] - 1s 3ms/step - loss: 0.3870 - val_loss: 0.4068\n", + "Epoch 10/100\n", + "348/348 [==============================] - 1s 3ms/step - loss: 0.3689 - val_loss: 0.3801\n", + "Epoch 11/100\n", + "348/348 [==============================] - 1s 3ms/step - loss: 0.3804 - val_loss: 0.3957\n" + ] + } + ], + "source": [ + "#我们知道长度为8\n", + "model = keras.models.Sequential([\n", + " keras.layers.Dense(30, activation='relu',\n", + " input_shape=[8]),\n", + " keras.layers.Dense(1),\n", + "])\n", + "model.compile(loss=\"mean_squared_error\", optimizer=\"sgd\")\n", + "callbacks = 
[keras.callbacks.EarlyStopping(\n",
+ " patience=5, min_delta=1e-2)]\n",
+ "\n",
+ "# with a (repeated) BatchDataset you must specify steps_per_epoch and validation_steps\n",
+ "history = model.fit(train_set,\n",
+ " validation_data = valid_set,\n",
+ " steps_per_epoch = 11160 // batch_size, # number of training steps per epoch\n",
+ " validation_steps = 3870 // batch_size,\n",
+ " epochs = 100,\n",
+ " callbacks = callbacks)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "161/161 [==============================] - 0s 2ms/step - loss: 0.3995\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.39946985244750977"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.evaluate(test_set, steps = 5160 // batch_size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[<tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 1, 2, 3])>,\n",
+ " <tf.Tensor: shape=(4,), dtype=int64, numpy=array([4, 5, 6, 7])>]"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset = tf.data.Dataset.range(8)\n",
+ "dataset = dataset.batch(4) # batch groups consecutive elements together, i.e. splits the data into batches\n",
+ "list(dataset)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}