{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2.2.0\n", "sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)\n", "matplotlib 3.3.4\n", "numpy 1.19.5\n", "pandas 1.1.5\n", "sklearn 0.24.2\n", "tensorflow 2.2.0\n", "tensorflow.keras 2.3.0-tf\n" ] } ], "source": [ "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "import numpy as np\n", "import sklearn\n", "import pandas as pd\n", "import os\n", "import sys\n", "import time\n", "import tensorflow as tf\n", "\n", "from tensorflow import keras\n", "\n", "print(tf.__version__)\n", "print(sys.version_info)\n", "for module in mpl, np, pd, sklearn, tf, keras:\n", " print(module.__name__, module.__version__)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_california_housing\n", "\n", "housing = fetch_california_housing()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(11610, 8) (11610,)\n", "(3870, 8) (3870,)\n", "(5160, 8) (5160,)\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "x_train_all, x_test, y_train_all, y_test = train_test_split(\n", " housing.data, housing.target, random_state = 7)\n", "x_train, x_valid, y_train, y_valid = train_test_split(\n", " x_train_all, y_train_all, random_state = 11)\n", "print(x_train.shape, y_train.shape)\n", "print(x_valid.shape, y_valid.shape)\n", "print(x_test.shape, y_test.shape)\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", "\n", "scaler = StandardScaler()\n", "x_train_scaled = scaler.fit_transform(x_train)\n", "x_valid_scaled = scaler.transform(x_valid)\n", "x_test_scaled = scaler.transform(x_test)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "!rm -rf generate_csv" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf01-dataset_basic_api.ipynb tf03-tfrecord_basic_api.ipynb\r\n", "tf02_data_generate_csv.ipynb tf04_data_generate_tfrecord.ipynb\r\n" ] } ], "source": [ "!ls" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "numpy.ndarray" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(x_train_scaled)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['.ipynb_checkpoints',\n", " 'tf02_data_generate_csv.ipynb',\n", " 'tf04_data_generate_tfrecord.ipynb',\n", " 'tf03-tfrecord_basic_api.ipynb',\n", " 'tf01-dataset_basic_api.ipynb']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.listdir()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 [0 1 2 3 4]\n", "1 [5 6 7 8 9]\n", "2 [10 11 12 13 14]\n", "3 [15 16 17 18 19]\n" ] } ], "source": [ "#为了把数据分好\n", "for file_idx, row_indices in enumerate(np.array_split(np.arange(20), 4)):\n", " print(file_idx,row_indices)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue\n", "--------------------------------------------------\n" ] } ], "source": [ "#下面要把特征工程后的数据存为csv文件\n", "output_dir = \"generate_csv\"\n", "if not os.path.exists(output_dir):\n", " os.mkdir(output_dir)\n", "\n", "#save_to_csv是工作可以直接复用的\n", "def save_to_csv(output_dir, data, name_prefix,\n", " header=None, n_parts=10):\n", " #生成文件名 格式generate_csv/{}_{:02d}.csv\n", " path_format = os.path.join(output_dir, \"{}_{:02d}.csv\") \n", " filenames = []\n", " #把数据分为n_parts部分,写到文件中去\n", " for file_idx, row_indices in enumerate(\n", " np.array_split(np.arange(len(data)), n_parts)):\n", " #print(file_idx,row_indices)\n", " #生成子文件名\n", " part_csv = path_format.format(name_prefix, file_idx)\n", " filenames.append(part_csv) #文件名添加到列表\n", " with open(part_csv, \"w\", encoding=\"utf-8\") as f:\n", " #先写头部\n", " if header is not None:\n", " f.write(header + \"\\n\")\n", " for row_index in row_indices:\n", " #把字符串化后的每个字符串用逗号拼接起来\n", " f.write(\",\".join(\n", " [repr(col) for col in data[row_index]]))\n", " f.write('\\n')\n", " return filenames\n", "#np.c_把x和y合并起来,按轴1合并\n", "train_data = np.c_[x_train_scaled, y_train]\n", "valid_data = np.c_[x_valid_scaled, y_valid]\n", "test_data = np.c_[x_test_scaled, y_test]\n", "#头部,特征,也有目标\n", "header_cols = housing.feature_names + [\"MidianHouseValue\"]\n", "#把列表变为字符串\n", "header_str = \",\".join(header_cols)\n", "print(header_str)\n", "print('-'*50)\n", "train_filenames = save_to_csv(output_dir, train_data, \"train\",\n", " header_str, n_parts=20)\n", "valid_filenames = save_to_csv(output_dir, valid_data, \"valid\",\n", " header_str, n_parts=10)\n", "test_filenames = save_to_csv(output_dir, test_data, \"test\",\n", " header_str, n_parts=10)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "temp_array=np.array([[1,2,3],[4,5,6]])\n", "np.savetxt(\"temp.csv\",temp_array) #savetxt会自动将整型数或者浮点数转为字符串存储" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.000000000000000000e+00 2.000000000000000000e+00 3.000000000000000000e+00\r\n", "4.000000000000000000e+00 5.000000000000000000e+00 6.000000000000000000e+00\r\n" ] } ], "source": [ "!cat temp.csv" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['generate_csv/train_00.csv', 'generate_csv/train_01.csv', 'generate_csv/train_02.csv', 'generate_csv/train_03.csv', 'generate_csv/train_04.csv', 'generate_csv/train_05.csv', 'generate_csv/train_06.csv', 'generate_csv/train_07.csv', 'generate_csv/train_08.csv', 'generate_csv/train_09.csv', 'generate_csv/train_10.csv', 'generate_csv/train_11.csv', 'generate_csv/train_12.csv', 'generate_csv/train_13.csv', 'generate_csv/train_14.csv', 'generate_csv/train_15.csv', 'generate_csv/train_16.csv', 'generate_csv/train_17.csv', 'generate_csv/train_18.csv', 'generate_csv/train_19.csv']\n", "train filenames:\n", "['generate_csv/train_00.csv',\n", " 'generate_csv/train_01.csv',\n", " 'generate_csv/train_02.csv',\n", " 'generate_csv/train_03.csv',\n", " 'generate_csv/train_04.csv',\n", " 'generate_csv/train_05.csv',\n", " 'generate_csv/train_06.csv',\n", " 'generate_csv/train_07.csv',\n", " 'generate_csv/train_08.csv',\n", " 'generate_csv/train_09.csv',\n", " 'generate_csv/train_10.csv',\n", " 'generate_csv/train_11.csv',\n", " 'generate_csv/train_12.csv',\n", " 
 " 'generate_csv/train_13.csv',\n", " 'generate_csv/train_14.csv',\n", " 'generate_csv/train_15.csv',\n", " 'generate_csv/train_16.csv',\n", " 'generate_csv/train_17.csv',\n", " 'generate_csv/train_18.csv',\n", " 'generate_csv/train_19.csv']\n", "valid filenames:\n", "['generate_csv/valid_00.csv',\n", " 'generate_csv/valid_01.csv',\n", " 'generate_csv/valid_02.csv',\n", " 'generate_csv/valid_03.csv',\n", " 'generate_csv/valid_04.csv',\n", " 'generate_csv/valid_05.csv',\n", " 'generate_csv/valid_06.csv',\n", " 'generate_csv/valid_07.csv',\n", " 'generate_csv/valid_08.csv',\n", " 'generate_csv/valid_09.csv']\n", "test filenames:\n", "['generate_csv/test_00.csv',\n", " 'generate_csv/test_01.csv',\n", " 'generate_csv/test_02.csv',\n", " 'generate_csv/test_03.csv',\n", " 'generate_csv/test_04.csv',\n", " 'generate_csv/test_05.csv',\n", " 'generate_csv/test_06.csv',\n", " 'generate_csv/test_07.csv',\n", " 'generate_csv/test_08.csv',\n", " 'generate_csv/test_09.csv']\n" ] } ], "source": [ "# Take a look at the generated filenames\n", "print(train_filenames)\n", "import pprint  # pprint makes the output easier to read\n", "print(\"train filenames:\")\n", "pprint.pprint(train_filenames)\n", "print(\"valid filenames:\")\n", "pprint.pprint(valid_filenames)\n", "print(\"test filenames:\")\n", "pprint.pprint(test_filenames)" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'generate_csv/train_13.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_11.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_12.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_15.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_10.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_02.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_00.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_09.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_19.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_04.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_18.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_17.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_08.csv', shape=(), dtype=string)\n" ] } ], "source": [ "# 1. filename -> dataset\n", "# 2. read file -> dataset -> datasets -> merge\n",
parse csv\n", "#list_files把文件名搞为一个dataset\n", "# list_files默认行为是按不确定的随机混排顺序返回文件名\n", "filename_dataset = tf.data.Dataset.list_files(train_filenames)\n", "for filename in filename_dataset:\n", " print(filename)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'generate_csv/train_00.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_02.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_04.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_08.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_09.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_10.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_11.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_12.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_13.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_15.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_17.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_18.csv', shape=(), dtype=string)\n", "tf.Tensor(b'generate_csv/train_19.csv', shape=(), dtype=string)\n" ] } ], "source": [ "filename_mydataset=tf.data.Dataset.from_tensor_slices(train_filenames)\n", "filename_mydataset=filename_mydataset.repeat(1)\n", "for i in filename_mydataset:\n", " print(i)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 把数据从文件中拿出来" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'0.801544314532886,0.27216142415910205,-0.11624392696666119,-0.2023115137272354,-0.5430515742518128,-0.021039615516440048,-0.5897620622908205,-0.08241845654707416,3.226', shape=(), dtype=string)\n", "tf.Tensor(b'-0.2980728090942217,0.3522616607867429,-0.10920507530549702,-0.25055520947444,-0.034064024638222286,-0.006034004264459185,1.080554840130013,-1.0611381656679573,1.514', shape=(), dtype=string)\n", "tf.Tensor(b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147', shape=(), dtype=string)\n", "tf.Tensor(b'-0.6906143291679195,-0.1283397589791022,7.0201810347470595,5.624287386169439,-0.2663292879200034,-0.03662080416157129,-0.6457503383496215,1.2058962626018372,1.352', shape=(), dtype=string)\n", "tf.Tensor(b'0.401276648075221,-0.9293421252555106,-0.05333050451405854,-0.1865945262276826,0.6545661895448709,0.026434465728210874,0.9312527706398824,-1.4406417263474771,2.512', shape=(), dtype=string)\n", "tf.Tensor(b'-0.8757754235423053,1.874166156711919,-0.9487499555702599,-0.09657184824705009,-0.7163432355284542,-0.07790191228558485,0.9825753570271144,-1.4206678547327694,2.75', shape=(), dtype=string)\n", 
"tf.Tensor(b'0.15782311132800697,0.43236189741438374,0.3379948076652917,-0.015880306122244434,-0.3733890577139493,-0.05305245634489608,0.8006134598360177,-1.2359095422966828,3.169', shape=(), dtype=string)\n", "tf.Tensor(b'2.2878417437355094,-1.8905449647872008,0.6607106467795992,-0.14964778023694128,-0.06672632728722275,0.44788055801575993,-0.5337737862320228,0.5667323709310584,3.59', shape=(), dtype=string)\n", "tf.Tensor(b'-1.0591781535672364,1.393564736946074,-0.026331968874673636,-0.11006759528831847,-0.6138198966579805,-0.09695934953589447,0.3247131133362288,-0.037477245413977976,0.672', shape=(), dtype=string)\n", "tf.Tensor(b'-0.2223565745313433,1.393564736946074,0.02991299565857307,0.0801452044790158,-0.509481985418118,-0.06238599304952824,-0.86503775291325,0.8613469772480595,2.0', shape=(), dtype=string)\n", "tf.Tensor(b'-0.03058829290446139,-0.9293421252555106,0.2596214817762415,-0.00601274044096368,-0.5004091235711734,-0.030779867916061836,1.5984463936739026,-1.8151518191233238,1.598', shape=(), dtype=string)\n", "tf.Tensor(b'1.9063832474401923,0.5124621340420246,0.44758280183798754,-0.276721775345798,-0.6310583341671753,-0.07081146722873086,-0.7064043040799849,0.7464972154634646,5.00001', shape=(), dtype=string)\n", "tf.Tensor(b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138', shape=(), dtype=string)\n", "tf.Tensor(b'0.29422955783115173,1.874166156711919,0.004626028663628252,-0.28479278487900694,-0.5602900117610076,-0.1196496378702887,1.3558305307524392,-0.9512818717870428,1.625', shape=(), dtype=string)\n", "tf.Tensor(b'0.7751155655229017,1.874166156711919,0.15645971958808144,-0.18905190538070707,-0.6292437617977863,-0.08791603438866835,-0.7483955111240856,0.5717258388347319,4.851', shape=(), dtype=string)\n" ] } ], "source": [ "#一访问list_files的dataset对象就随机了文件顺序\n", "# for filename in filename_dataset:\n", "# print(filename)\n", "n_readers = 5\n", "dataset = filename_mydataset.interleave(\n", " #前面1行是header\n", "# lambda filename: tf.data.TextLineDataset(filename),\n", " #不带header,把特征名字去掉\n", " lambda filename: tf.data.TextLineDataset(filename).skip(1),\n", " cycle_length = n_readers, #cycle_length和block_length增加获取了数据的随机性\n", " block_length=2\n", ")\n", "for line in dataset.take(15):\n", " print(line)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 把每一行数据切分为对应类型" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[, , , , ]\n" ] } ], "source": [ "#parse csv 解析csv,通过decode_csv\n", "# tf.io.decode_csv(str, record_defaults)\n", "\n", "sample_str = '1,2,3,4,5'\n", "record_defaults = [\n", " tf.constant(0, dtype=tf.int32),\n", " 0,\n", " np.nan,\n", " \"hello1\",\n", " tf.constant([])#没有固定类型,默认是float32\n", "]\n", "#sample_str数据格式化,按照record_defaults进行处理\n", "parsed_fields = tf.io.decode_csv(sample_str, record_defaults)\n", "print(parsed_fields)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#我们传一个空的字符串测试\n", "#最后一个为1是可以转换的\n", "try:\n", " parsed_fields = tf.io.decode_csv(',,,,1', record_defaults)\n", "except tf.errors.InvalidArgumentError as ex:\n", " print(ex)\n", "parsed_fields" ] }, { "cell_type": "code", "execution_count": 20, "metadata": 
{}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]\n" ] } ], "source": [ "#我们给的值过多的情况\n", "try:\n", " parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,', record_defaults)\n", "except tf.errors.InvalidArgumentError as ex:\n", " print(ex)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(,\n", " )" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#解析一行\n", "def parse_csv_line(line, n_fields = 9):\n", " #先写一个默认的格式,就是9个nan,如果从csv中读取缺失数据,就会变为nan\n", " defs = [tf.constant(np.nan)] * n_fields\n", " #使用decode_csv解析\n", " parsed_fields = tf.io.decode_csv(line, record_defaults=defs)\n", " #前8个是x,最后一个是y\n", " x = tf.stack(parsed_fields[0:-1])\n", " y = tf.stack(parsed_fields[-1:])\n", " return x, y\n", "\n", "parse_csv_line(b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138',\n", " n_fields=9)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "--------------------------------------------------\n", "x:\n", "\n", "y:\n", "\n", "x:\n", "\n", "y:\n", "\n" ] } ], "source": [ "# 1. filename -> dataset\n", "# 2. read file -> dataset -> datasets -> merge\n", "# 3. parse csv\n", "#完成整个流程\n", "def csv_reader_dataset(filenames, n_readers=5,\n", " batch_size=32, n_parse_threads=5,\n", " shuffle_buffer_size=10000):\n", " #把文件名类别变为dataset tensor\n", " dataset = tf.data.Dataset.list_files(filenames)\n", " #变为repeat dataset可以让读到最后一个样本时,从新去读第一个样本\n", " dataset = dataset.repeat()\n", " dataset = dataset.interleave(\n", " #skip(1)是因为每个文件存了特征名字,target名字\n", " lambda filename: tf.data.TextLineDataset(filename).skip(1),\n", " cycle_length = n_readers\n", " )\n", " dataset.shuffle(shuffle_buffer_size) #对数据进行洗牌,混乱\n", " #map,通过parse_csv_line对数据集进行映射,map只会给函数传递一个参数,这个参数\n", " #就是dataset中的tensor\n", " dataset = dataset.map(parse_csv_line,\n", " num_parallel_calls=n_parse_threads)\n", " dataset = dataset.batch(batch_size)\n", " return dataset\n", "#这里是一个测试,写4是为了大家理解\n", "train_set = csv_reader_dataset(train_filenames, batch_size=4)\n", "print(train_set)\n", "print('-'*50)\n", "i=0\n", "#是csv_reader_dataset处理后的结果,\n", "for x_batch, y_batch in train_set.take(2):\n", "# i=i+1\n", " print(\"x:\")\n", " pprint.pprint(x_batch)\n", " print(\"y:\")\n", " pprint.pprint(y_batch)\n", "# print(i)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 137 ms, sys: 40.3 ms, total: 177 ms\n", "Wall time: 160 ms\n" ] } ], "source": [ "%%time\n", "batch_size = 32\n", "train_set = csv_reader_dataset(train_filenames,\n", " batch_size = batch_size)\n", "valid_set = csv_reader_dataset(valid_filenames,\n", " batch_size = batch_size)\n", "test_set = csv_reader_dataset(test_filenames,\n", " batch_size = batch_size)\n", "\n", "# print(train_set)\n", "# print(valid_set)\n", "# print(test_set)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/100\n", "348/348 [==============================] - 1s 3ms/step - loss: 1.1306 - val_loss: 0.9811\n", "Epoch 2/100\n", "348/348 [==============================] - 1s 3ms/step - loss: 2.4388 - val_loss: 0.5692\n", "Epoch 
3/100\n", "348/348 [==============================] - 1s 3ms/step - loss: 0.5545 - val_loss: 0.6181\n", "Epoch 4/100\n", "348/348 [==============================] - 1s 4ms/step - loss: 0.6097 - val_loss: 0.4497\n", "Epoch 5/100\n", "348/348 [==============================] - 1s 3ms/step - loss: 0.4277 - val_loss: 0.4555\n", "Epoch 6/100\n", "348/348 [==============================] - 1s 4ms/step - loss: 0.3998 - val_loss: 0.3870\n", "Epoch 7/100\n", "348/348 [==============================] - 1s 4ms/step - loss: 0.3889 - val_loss: 0.4119\n", "Epoch 8/100\n", "348/348 [==============================] - 1s 3ms/step - loss: 0.3831 - val_loss: 0.3941\n", "Epoch 9/100\n", "348/348 [==============================] - 1s 3ms/step - loss: 0.3870 - val_loss: 0.4068\n", "Epoch 10/100\n", "348/348 [==============================] - 1s 3ms/step - loss: 0.3689 - val_loss: 0.3801\n", "Epoch 11/100\n", "348/348 [==============================] - 1s 3ms/step - loss: 0.3804 - val_loss: 0.3957\n" ] } ], "source": [ "#我们知道长度为8\n", "model = keras.models.Sequential([\n", " keras.layers.Dense(30, activation='relu',\n", " input_shape=[8]),\n", " keras.layers.Dense(1),\n", "])\n", "model.compile(loss=\"mean_squared_error\", optimizer=\"sgd\")\n", "callbacks = [keras.callbacks.EarlyStopping(\n", " patience=5, min_delta=1e-2)]\n", "\n", "#当是BatchDataset,必须制定steps_per_epoch,validation_steps\n", "history = model.fit(train_set,\n", " validation_data = valid_set,\n", " steps_per_epoch = 11160 // batch_size, #每epoch训练的步数\n", " validation_steps = 3870 // batch_size,\n", " epochs = 100,\n", " callbacks = callbacks)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "161/161 [==============================] - 0s 2ms/step - loss: 0.3995\n" ] }, { "data": { "text/plain": [ "0.39946985244750977" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.evaluate(test_set, steps = 5160 // batch_size)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = tf.data.Dataset.range(8)\n", "dataset = dataset.batch(4) #把tensor组合到一起,就是分了batch\n", "list(dataset)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }