# Copyright 2021 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC
from pathlib import Path

import numpy as np
import pandas as pd
from pycocotools.coco import COCO

from sedna.common.file_ops import FileOps
from sedna.common.class_factory import ClassFactory, ClassType

__all__ = ('BaseDataSource', 'TxtDataParse', 'CSVDataParse', 'JSONDataParse')

class BaseDataSource:
    """
    An abstract class representing a :class:`BaseDataSource`.

    All datasets that represent a map from keys to data samples should
    subclass it. All subclasses should overwrite `parse`, supporting getting
    train/eval/infer data by a function. Subclasses could also optionally
    overwrite `__len__`, which is expected to return the size of the dataset,
    and overwrite `x` for the feature embedding and `y` for the target label.

    Parameters
    ----------
    data_type : str
        defines whether the datasource is train/eval/test
    func : function
        function used to parse an iterable object batch by batch
    """
    def __init__(self, data_type="train", func=None):
        self.data_type = data_type  # sample type: train/eval/test
        self.process_func = None
        if callable(func):
            self.process_func = func
        elif func:
            self.process_func = ClassFactory.get_cls(
                ClassType.CALLBACK, func)()
        self.x = None  # sample feature
        self.y = None  # sample label
        self.meta_attr = None  # special in lifelong learning

    def num_examples(self) -> int:
        return len(self.x)

    def __len__(self):
        return self.num_examples()

    def parse(self, *args, **kwargs):
        raise NotImplementedError

    @property
    def is_test_data(self):
        return self.data_type == "test"

    def save(self, output=""):
        return FileOps.dump(self, output)
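
# A minimal sketch of subclassing BaseDataSource (the subclass name and the
# in-memory samples below are hypothetical, not part of this module):
# `parse` only has to populate `x` and `y`, after which `__len__`,
# `num_examples` and `save` work unchanged.
#
#     class ListDataParse(BaseDataSource, ABC):
#         """Parse in-memory (feature, label) pairs."""
#
#         def parse(self, samples, **kwargs):
#             self.x = np.array([s[0] for s in samples])
#             self.y = np.array([s[1] for s in samples])
#
#     ds = ListDataParse(data_type="train")
#     ds.parse([("img1.png", 0), ("img2.png", 1)])
#     assert len(ds) == 2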

class TxtDataParse(BaseDataSource, ABC):
    """
    Parser for txt files which contain an image list.
    """

    def __init__(self, data_type, func=None):
        super(TxtDataParse, self).__init__(data_type=data_type, func=func)
    def parse(self, *args, **kwargs):
        x_data = []
        y_data = []
        use_raw = kwargs.get("use_raw")
        for f in args:
            if not (f and FileOps.exists(f)):
                continue
            with open(f) as fin:
                if self.process_func:
                    res = list(map(self.process_func, [
                        line.strip() for line in fin.readlines()]))
                else:
                    res = [line.strip().split() for line in fin.readlines()]
            for tup in res:
                if not len(tup):
                    continue
                if use_raw:
                    x_data.append(tup)
                else:
                    x_data.append(tup[0])
                    if not self.is_test_data:
                        if len(tup) > 1:
                            y_data.append(tup[1])
                        else:
                            y_data.append(0)
        self.x = np.array(x_data)
        self.y = np.array(y_data)
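
# Usage sketch (the index file path is hypothetical): each line of the txt
# file is expected to hold a feature path optionally followed by a label,
# whitespace-separated; with `use_raw` unset, `x` collects the first token
# and `y` the label (or 0 when the line has no label):
#
#     train_data = TxtDataParse(data_type="train")
#     train_data.parse("/data/train_index.txt")
#     print(train_data.x[0], train_data.y[0])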

class CSVDataParse(BaseDataSource, ABC):
    """
    Parser for csv files which contain structured data.
    """

    def __init__(self, data_type, func=None):
        super(CSVDataParse, self).__init__(data_type=data_type, func=func)
    @staticmethod
    def parse_json(lines: dict, **kwargs) -> pd.DataFrame:
        return pd.DataFrame.from_dict([lines], **kwargs)

    def parse(self, *args, **kwargs):
        x_data = []
        y_data = []
        label = kwargs.pop("label") if "label" in kwargs else ""
        usecols = kwargs.get("usecols", "")
        if usecols and isinstance(usecols, str):
            usecols = usecols.split(",")
        if len(usecols):
            if label and label not in usecols:
                usecols.append(label)
            kwargs["usecols"] = usecols
        for f in args:
            if isinstance(f, (dict, list)):
                res = self.parse_json(f, **kwargs)
            else:
                if not (f and FileOps.exists(f)):
                    continue
                res = pd.read_csv(f, **kwargs)
            if self.process_func and callable(self.process_func):
                res = self.process_func(res)
            if label:
                if label not in res.columns:
                    continue
                y = res[label]
                y_data.append(y)
                res.drop(label, axis=1, inplace=True)
            x_data.append(res)
        if not x_data:
            return
        self.x = pd.concat(x_data)
        self.y = pd.concat(y_data)
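
# Usage sketch (file name and column names are hypothetical): `label` names
# the target column, which is split out of the frame into `y` while the
# remaining columns become `x`; any other keyword arguments are forwarded
# to `pd.read_csv`:
#
#     train_data = CSVDataParse(data_type="train")
#     train_data.parse("/data/train.csv", label="target")
#     features, target = train_data.x, train_data.y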

class JSONDataParse(BaseDataSource, ABC):
    """
    Parser for json files which contain structured data
    (COCO-style annotations, read via ``pycocotools``).
    """

    def __init__(self, data_type, func=None):
        super(JSONDataParse, self).__init__(data_type=data_type, func=func)
        self.data_dir = None
        self.coco = None
        self.ids = None
        self.class_ids = None
        self.annotations = None
    def parse(self, *args, **kwargs):
        DIRECTORY = "train"
        LABEL_PATH = "*/gt/gt_val_half.txt"
        filepath = Path(*args)
        # the image directory is assumed to sit two levels above the
        # annotation file, under DIRECTORY
        self.data_dir = Path(Path(filepath).parents[1], DIRECTORY)
        self.coco = COCO(filepath)
        self.ids = self.coco.getImgIds()
        self.class_ids = sorted(self.coco.getCatIds())
        self.annotations = [self.load_anno_from_ids(_ids) for _ids in self.ids]
        self.x = {
            "data_dir": self.data_dir,
            "coco": self.coco,
            "ids": self.ids,
            "class_ids": self.class_ids,
            "annotations": self.annotations,
        }
        self.y = list(self.data_dir.glob(LABEL_PATH))
    def load_anno_from_ids(self, id_):
        im_ann = self.coco.loadImgs(id_)[0]
        width = im_ann["width"]
        height = im_ann["height"]
        frame_id = im_ann["frame_id"]
        video_id = im_ann["video_id"]
        anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False)
        annotations = self.coco.loadAnns(anno_ids)
        objs = []
        for obj in annotations:
            if obj["area"] > 0 and obj["bbox"][2] >= 0 and obj["bbox"][3] >= 0:
                # convert the COCO (x, y, w, h) box to (x1, y1, x2, y2)
                obj["clean_bbox"] = [
                    obj["bbox"][0], obj["bbox"][1],
                    obj["bbox"][0] + obj["bbox"][2],
                    obj["bbox"][1] + obj["bbox"][3]]
                objs.append(obj)
        # one row per object: x1, y1, x2, y2, class index, track id
        res = np.zeros((len(objs), 6))
        for i, obj in enumerate(objs):
            res[i, 0:4] = obj["clean_bbox"]
            res[i, 4] = self.class_ids.index(obj["category_id"])
            res[i, 5] = obj["track_id"]
        file_name = (
            im_ann["file_name"] if "file_name" in im_ann
            else f"{id_:012}.jpg")
        img_info = (height, width, frame_id, video_id, file_name)
        del im_ann, annotations
        return (res, img_info, file_name)
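
# Usage sketch (the paths are hypothetical): a MOT-style layout is assumed,
# with a COCO json under ``/data/annotations`` whose image entries carry
# ``frame_id``/``video_id`` and whose objects carry ``track_id``, and with
# images under ``/data/train``:
#
#     val_data = JSONDataParse(data_type="eval")
#     val_data.parse("/data/annotations/val_half.json")
#     boxes, img_info, file_name = val_data.x["annotations"][0]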