From c5d7ace6a60b3933c500910b9eea7b01cc0a9c9a Mon Sep 17 00:00:00 2001
From: York You <573861119@qq.com>
Date: Mon, 31 Oct 2022 11:00:38 +0800
Subject: [PATCH] Add JSON data parse

Signed-off-by: York You <573861119@qq.com>
---
 lib/sedna/datasources/__init__.py | 371 ++++++++++++++++++------------
 1 file changed, 219 insertions(+), 152 deletions(-)

diff --git a/lib/sedna/datasources/__init__.py b/lib/sedna/datasources/__init__.py
index 8190b76f..35f8502f 100644
--- a/lib/sedna/datasources/__init__.py
+++ b/lib/sedna/datasources/__init__.py
@@ -1,152 +1,219 @@
-# Copyright 2021 The KubeEdge Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from abc import ABC
-
-import numpy as np
-import pandas as pd
-
-from sedna.common.file_ops import FileOps
-from sedna.common.class_factory import ClassFactory, ClassType
-
-__all__ = ('BaseDataSource', 'TxtDataParse', 'CSVDataParse')
-
-
-class BaseDataSource:
-    """
-    An abstract class representing a :class:`BaseDataSource`.
-
-    All datasets that represent a map from keys to data samples should subclass
-    it. All subclasses should overwrite parse`, supporting get train/eval/infer
-    data by a function. Subclasses could also optionally overwrite `__len__`,
-    which is expected to return the size of the dataset.overwrite `x` for the
-    feature-embedding, `y` for the target label.
-
-    Parameters
-    ----------
-    data_type : str
-        define the datasource is train/eval/test
-    func: function
-        function use to parse an iter object batch by batch
-    """
-
-    def __init__(self, data_type="train", func=None):
-        self.data_type = data_type  # sample type: train/eval/test
-        self.process_func = None
-        if callable(func):
-            self.process_func = func
-        elif func:
-            self.process_func = ClassFactory.get_cls(
-                ClassType.CALLBACK, func)()
-        self.x = None  # sample feature
-        self.y = None  # sample label
-        self.meta_attr = None  # special in lifelong learning
-
-    def num_examples(self) -> int:
-        return len(self.x)
-
-    def __len__(self):
-        return self.num_examples()
-
-    def parse(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @property
-    def is_test_data(self):
-        return self.data_type == "test"
-
-    def save(self, output=""):
-        return FileOps.dump(self, output)
-
-
-class TxtDataParse(BaseDataSource, ABC):
-    """
-    txt file which contain image list parser
-    """
-
-    def __init__(self, data_type, func=None):
-        super(TxtDataParse, self).__init__(data_type=data_type, func=func)
-
-    def parse(self, *args, **kwargs):
-        x_data = []
-        y_data = []
-        use_raw = kwargs.get("use_raw")
-        for f in args:
-            if not (f and FileOps.exists(f)):
-                continue
-            with open(f) as fin:
-                if self.process_func:
-                    res = list(map(self.process_func, [
-                        line.strip() for line in fin.readlines()]))
-                else:
-                    res = [line.strip().split() for line in fin.readlines()]
-            for tup in res:
-                if not len(tup):
-                    continue
-                if use_raw:
-                    x_data.append(tup)
-                else:
-                    x_data.append(tup[0])
-                    if not self.is_test_data:
-                        if len(tup) > 1:
-                            y_data.append(tup[1])
-                        else:
-                            y_data.append(0)
-        self.x = np.array(x_data)
-        self.y = np.array(y_data)
-
-
-class CSVDataParse(BaseDataSource, ABC):
-    """
-    csv file which contain Structured Data parser
-    """
-
-    def __init__(self, data_type, func=None):
-        super(CSVDataParse, self).__init__(data_type=data_type, func=func)
-
-    @staticmethod
-    def parse_json(lines: dict, **kwargs) -> pd.DataFrame:
-        return pd.DataFrame.from_dict([lines], **kwargs)
-
-    def parse(self, *args, **kwargs):
-        x_data = []
-        y_data = []
-        label = kwargs.pop("label") if "label" in kwargs else ""
-        usecols = kwargs.get("usecols", "")
-        if usecols and isinstance(usecols, str):
-            usecols = usecols.split(",")
-        if len(usecols):
-            if label and label not in usecols:
-                usecols.append(label)
-            kwargs["usecols"] = usecols
-        for f in args:
-            if isinstance(f, (dict, list)):
-                res = self.parse_json(f, **kwargs)
-            else:
-                if not (f and FileOps.exists(f)):
-                    continue
-                res = pd.read_csv(f, **kwargs)
-            if self.process_func and callable(self.process_func):
-                res = self.process_func(res)
-            if label:
-                if label not in res.columns:
-                    continue
-                y = res[label]
-                y_data.append(y)
-                res.drop(label, axis=1, inplace=True)
-            x_data.append(res)
-        if not x_data:
-            return
-        self.x = pd.concat(x_data)
-        self.y = pd.concat(y_data)
+# Copyright 2021 The KubeEdge Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from pycocotools.coco import COCO
+
+from sedna.common.file_ops import FileOps
+from sedna.common.class_factory import ClassFactory, ClassType
+
+__all__ = ('BaseDataSource', 'TxtDataParse', 'CSVDataParse', 'JSONDataParse')
+
+
+class BaseDataSource:
+    """
+    An abstract class representing a :class:`BaseDataSource`.
+
+    All datasets that represent a map from keys to data samples should
+    subclass it. All subclasses should overwrite `parse`, which supports
+    getting train/eval/infer data via a function. Subclasses may optionally
+    overwrite `__len__` (expected to return the dataset size) and overwrite
+    `x` for the feature embedding and `y` for the target label.
+
+    Parameters
+    ----------
+    data_type : str
+        defines whether the datasource is train/eval/test
+    func: function
+        function used to parse an iterable object batch by batch
+    """
+
+    def __init__(self, data_type="train", func=None):
+        self.data_type = data_type  # sample type: train/eval/test
+        self.process_func = None
+        if callable(func):
+            self.process_func = func
+        elif func:
+            self.process_func = ClassFactory.get_cls(
+                ClassType.CALLBACK, func)()
+        self.x = None  # sample feature
+        self.y = None  # sample label
+        self.meta_attr = None  # special in lifelong learning
+
+    def num_examples(self) -> int:
+        return len(self.x)
+
+    def __len__(self):
+        return self.num_examples()
+
+    def parse(self, *args, **kwargs):
+        raise NotImplementedError
+
+    @property
+    def is_test_data(self):
+        return self.data_type == "test"
+
+    def save(self, output=""):
+        return FileOps.dump(self, output)
+
+
+class TxtDataParse(BaseDataSource, ABC):
+    """
+    Parser for txt files that contain an image list.
+    """
+
+    def __init__(self, data_type, func=None):
+        super(TxtDataParse, self).__init__(data_type=data_type, func=func)
+
+    def parse(self, *args, **kwargs):
+        x_data = []
+        y_data = []
+        use_raw = kwargs.get("use_raw")
+        for f in args:
+            if not (f and FileOps.exists(f)):
+                continue
+            with open(f) as fin:
+                if self.process_func:
+                    res = list(map(self.process_func, [
+                        line.strip() for line in fin.readlines()]))
+                else:
+                    res = [line.strip().split() for line in fin.readlines()]
+            for tup in res:
+                if not len(tup):
+                    continue
+                if use_raw:
+                    x_data.append(tup)
+                else:
+                    x_data.append(tup[0])
+                    if not self.is_test_data:
+                        if len(tup) > 1:
+                            y_data.append(tup[1])
+                        else:
+                            y_data.append(0)
+        self.x = np.array(x_data)
+        self.y = np.array(y_data)
+
+
+class CSVDataParse(BaseDataSource, ABC):
+    """
+    Parser for csv files that contain structured data.
+    """
+
+    def __init__(self, data_type, func=None):
+        super(CSVDataParse, self).__init__(data_type=data_type, func=func)
+
+    @staticmethod
+    def parse_json(lines: dict, **kwargs) -> pd.DataFrame:
+        return pd.DataFrame.from_dict([lines], **kwargs)
+
+    def parse(self, *args, **kwargs):
+        x_data = []
+        y_data = []
+        label = kwargs.pop("label") if "label" in kwargs else ""
+        usecols = kwargs.get("usecols", "")
+        if usecols and isinstance(usecols, str):
+            usecols = usecols.split(",")
+        if len(usecols):
+            if label and label not in usecols:
+                usecols.append(label)
+            kwargs["usecols"] = usecols
+        for f in args:
+            if isinstance(f, (dict, list)):
+                res = self.parse_json(f, **kwargs)
+            else:
+                if not (f and FileOps.exists(f)):
+                    continue
+                res = pd.read_csv(f, **kwargs)
+            if self.process_func and callable(self.process_func):
+                res = self.process_func(res)
+            if label:
+                if label not in res.columns:
+                    continue
+                y = res[label]
+                y_data.append(y)
+                res.drop(label, axis=1, inplace=True)
+            x_data.append(res)
+        if not x_data:
+            return
+        self.x = pd.concat(x_data)
+        self.y = pd.concat(y_data)
+
+
+class JSONDataParse(BaseDataSource, ABC):
+    """
+    Parser for json files in COCO annotation format that contain structured data.
+    """
+
+    def __init__(self, data_type, func=None):
+        super(JSONDataParse, self).__init__(data_type=data_type, func=func)
+        self.data_dir = None
+        self.coco = None
+        self.ids = None
+        self.class_ids = None
+        self.annotations = None
+
+    def parse(self, *args, **kwargs):
+        DIRECTORY = "train"
+        LABEL_PATH = "*/gt/gt_val_half.txt"
+        filepath = Path(*args)
+        self.data_dir = Path(Path(filepath).parents[1], DIRECTORY)
+        self.coco = COCO(filepath)
+        self.ids = self.coco.getImgIds()
+        self.class_ids = sorted(self.coco.getCatIds())
+        self.annotations = [self.load_anno_from_ids(_ids) for _ids in self.ids]
+        self.x = {
+            "data_dir": self.data_dir,
+            "coco": self.coco,
+            "ids": self.ids,
+            "class_ids": self.class_ids,
+            "annotations": self.annotations,
+        }
+        self.y = list(self.data_dir.glob(LABEL_PATH))
+
+    def load_anno_from_ids(self, id_):
+        im_ann = self.coco.loadImgs(id_)[0]
+        width = im_ann["width"]
+        height = im_ann["height"]
+        frame_id = im_ann["frame_id"]
+        video_id = im_ann["video_id"]
+        anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False)
+        annotations = self.coco.loadAnns(anno_ids)
+        objs = []
+        for obj in annotations:
+            # keep valid boxes only, converting [x, y, w, h] to [x1, y1, x2, y2]
+            if obj["area"] > 0 and obj["bbox"][2] >= 0 and obj["bbox"][3] >= 0:
+                obj["clean_bbox"] = [
+                    obj["bbox"][0], obj["bbox"][1],
+                    obj["bbox"][0] + obj["bbox"][2],
+                    obj["bbox"][1] + obj["bbox"][3]]
+                objs.append(obj)
+
+        # one row per object: [x1, y1, x2, y2, class index, track id]
+        res = np.zeros((len(objs), 6))
+
+        for i, obj in enumerate(objs):
+            res[i, 0:4] = obj["clean_bbox"]
+            res[i, 4] = self.class_ids.index(obj["category_id"])
+            res[i, 5] = obj["track_id"]
+
+        file_name = (
+            im_ann["file_name"] if "file_name" in im_ann
+            else f"{id_:012}.jpg")
+        img_info = (height, width, frame_id, video_id, file_name)
+
+        del im_ann, annotations
+
+        return (res, img_info, file_name)
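
Usage sketch (hypothetical, for illustration; the file names and dataset
layout below are assumptions, not part of the patch):

    from sedna.datasources import CSVDataParse, JSONDataParse

    # CSVDataParse: a csv with a "label" column; remaining kwargs are
    # forwarded to pandas.read_csv.
    csv_data = CSVDataParse(data_type="train")
    csv_data.parse("train.csv", label="label")  # hypothetical file
    print(csv_data.x.shape, csv_data.y.shape)

    # JSONDataParse: a COCO-format MOT annotation file. parse() derives
    # data_dir from the file's grandparent directory joined with the
    # hardcoded "train" subfolder, i.e. datasets/MOT17/train here.
    json_data = JSONDataParse(data_type="eval")
    json_data.parse("datasets/MOT17/annotations/val_half.json")

    # x bundles the COCO handle plus per-image annotations; each entry of
    # x["annotations"] is the (res, img_info, file_name) tuple returned by
    # load_anno_from_ids (assumes the annotation file is non-empty).
    res, img_info, file_name = json_data.x["annotations"][0]
    print(res.shape)    # (num_objects, 6): x1, y1, x2, y2, class, track_id
    print(img_info)     # (height, width, frame_id, video_id, file_name)
    print(json_data.y)  # any */gt/gt_val_half.txt files found under train/

Note that the annotation file is expected to sit two levels below the dataset
root, and y collects whatever ground-truth files match the hardcoded glob
pattern under the train directory.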