diff --git a/README.md b/README.md index 74090646..c99e5f15 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ ![Hex.pm](https://img.shields.io/hexpm/l/plug.svg) [![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest) -fastNLP是一款轻量级的自然语言处理(NLP)工具包,目标是快速实现NLP任务以及构建复杂模型。 +fastNLP是一款面向自然语言处理(NLP)的轻量级框架,目标是快速实现NLP任务以及构建复杂模型。 fastNLP具有如下的特性: diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index b362f23b..9c9e505b 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -474,8 +474,8 @@ class DataSet(object): if idx.start is not None and (idx.start >= len(self) or idx.start <= -len(self)): raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self) - 1}") data_set = DataSet() - for field in self.field_arrays.values(): - data_set.add_field(field_name=field.name, fields=field.content[idx], padder=field.padder, + for field_name, field in self.field_arrays.items(): + data_set.add_field(field_name=field_name, fields=field.content[idx], padder=field.padder, is_input=field.is_input, is_target=field.is_target, ignore_type=field.ignore_type) data_set.collater = self.collater.copy_from(self.collater) return data_set @@ -616,6 +616,7 @@ class DataSet(object): if len(self) != len(fieldarray): raise RuntimeError(f"The field to add must have the same size as dataset. " f"Dataset size {len(self)} != field size {len(fieldarray)}") + fieldarray.name = field_name self.field_arrays[field_name] = fieldarray def add_field(self, field_name, fields, padder=AutoPadder(), is_input=False, is_target=False, ignore_type=False): @@ -673,6 +674,7 @@ class DataSet(object): if not self.has_field(field_name): raise KeyError(f"Field:{field_name} not found in DataSet.") fieldarray = deepcopy(self.get_field(field_name)) + fieldarray.name = new_field_name self.add_fieldarray(field_name=new_field_name, fieldarray=fieldarray) return self diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index a26673b2..74ac7028 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -32,6 +32,7 @@ from .utils import _build_args from .utils import _build_fp16_env from .utils import _get_func_signature from .utils import _move_dict_value_to_device +from .sampler import Sampler __all__ = [ 'get_local_rank', @@ -54,7 +55,7 @@ def get_local_rank(): raise RuntimeError('Please use "python -m torch.distributed.launch --nproc_per_node=N train_script.py') -class DistTrainer(): +class DistTrainer: r""" 分布式的 Trainer,支持分布式训练和混合精度的训练。具体实现原理请阅读 pytorch 官方文档。 @@ -68,11 +69,11 @@ class DistTrainer(): dev_data=None, metrics=None, metric_key=None, update_every=1, print_every=10, validate_every=-1, save_path=None, device='auto', - fp16=False, use_tqdm=True, **kwargs): + fp16=False, use_tqdm=True, sampler=None, **kwargs): r""" :param train_data: 训练集, :class:`~fastNLP.DataSet` 类型。 - :param nn.modules model: 待训练的模型 + :param nn.modules, DDP model: 待训练的模型 :param optimizer: `torch.optim.Optimizer` 优化器。如果为None,则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器 :param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward` :param list callbacks_all: 用于在train过程中起调节作用的回调函数,作用于所有训练进程中。 @@ -101,13 +102,18 @@ class DistTrainer(): :param str device: 指定 device,可以是 gpu,cpu 或 auto :param bool fp16: 指定是否使用半精度训练。 :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 + :param Sampler sampler: 使用的sampler,如果不指定,默认使用的DistributedSampler。使用这个参数的情况一般为,明确修改了每个 + rank的Dataset,使得每个rank上的dataset虽然sample数量一样多,但是sample其实不一样。 :param kwargs: 支持配置可选参数 bool test_use_tqdm: 在dev上验证的时候是否开启tqdm Sampler test_sampler: 在evaluate的时候使用的sampler int dev_batch_size: 在evaluate时,使用的evaluate的batch大小 bool test_use_fp16: test时使用fp16 bool set_grad_to_none: zero_grad时将grad设为None而不是0 - GradScaler gradscaler: 自定义的梯度 scaler + GradScaler grad_scaler: 自定义的梯度 scaler + bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快数据速度。一般在tensor较多或tensor维度较大时,有速度增益。 + bool find_unused_parameters: 在将model转化为DistributedDataParallel类型的时候,需要填入该参数,除非model内确实有 + forward没用上的参数,否则应该不需要用到该参数。 """ assert device in ['auto', 'cuda', 'cpu'], "Please set correct device in [auto', 'cuda', 'cpu']" if device == 'auto': @@ -126,6 +132,9 @@ class DistTrainer(): self.rank = dist.get_rank() # unique id for each process self.train_data = train_data + self.kwargs = kwargs + if kwargs.get('batch_size', None): + batch_size_per_gpu = int(kwargs.get('batch_size')) self.batch_size_per_gpu = int(batch_size_per_gpu) self.n_epochs = int(n_epochs) self.num_data_workers = int(num_workers) @@ -137,7 +146,6 @@ class DistTrainer(): self.losser = _prepare_losser(loss) self.fp16 = fp16 self.local_rank = get_local_rank() - self._forward_func = model.forward self.callback_manager = DistCallbackManager( env={"trainer": self}, callbacks_all=callbacks_all, callbacks_master=callbacks_master) @@ -145,34 +153,50 @@ class DistTrainer(): self.metric_key = metric_key self.use_tqdm = use_tqdm - model.to(self.device) - # init fp16, must before DataParallel init autocast, GradScaler = _build_fp16_env(dummy=not self.fp16) self.auto_cast = autocast - user_grad_scaler = getattr(kwargs, 'gradscaler', None) + user_grad_scaler = kwargs.get('grad_scaler', None) if user_grad_scaler is not None: - assert self.fp16, "must set fp16=True to enable gradscaler" + assert self.fp16, "must set fp16=True to enable grad_scaler" grad_scaler = user_grad_scaler else: grad_scaler = GradScaler() self.grad_scaler = grad_scaler - self.set_grad_to_none = getattr(kwargs, 'set_grad_to_none', True) - + self.set_grad_to_none = kwargs.get('set_grad_to_none', False) # init DataParallel - if parse_version(torch.__version__)>=parse_version('1.1'): - self.ddp_model = DDP(model, device_ids=[self.local_rank], - output_device=self.local_rank, find_unused_parameters=True) + if isinstance(model, DDP): + self.ddp_model = model else: - self.ddp_model = DDP(model, device_ids=[self.local_rank], - output_device=self.local_rank) + model.to(self.device) + if parse_version(torch.__version__)>=parse_version('1.1'): + self.ddp_model = DDP(model, device_ids=[self.local_rank], + output_device=self.local_rank, + find_unused_parameters=kwargs.get('find_unused_parameters', False)) + else: + self.ddp_model = DDP(model, device_ids=[self.local_rank], + output_device=self.local_rank) self.model = self.ddp_model.module + self._forward_func = self.model.forward + self.model.to(self.device) + optimizer = self._get_optimizer(optimizer) self.optimizer = optimizer if isinstance(self.train_data, DataSet): - self.sampler = DistributedSampler(self.train_data) + if sampler is None: + self.sampler = DistributedSampler(self.train_data) + else: + # sampler check + if sampler is not None and not isinstance(sampler, (Sampler, torch.utils.data.Sampler)): + raise ValueError( + f"The type of sampler should be fastNLP.BaseSampler or pytorch's Sampler, got {type(sampler)}") + elif hasattr(sampler, 'set_batch_size'): + sampler.set_batch_size(batch_size_per_gpu) + self.sampler = sampler + # concerning issue from https://github.com/pytorch/pytorch/issues/57273 + self.pin_memory = kwargs.get('pin_memory', False if parse_version(torch.__version__)==parse_version('1.9') else True) self.data_iterator = self._get_data_iter(self.train_data) self.batch_size = self.world_size * self.batch_size_per_gpu self.n_steps = self._get_n_steps() @@ -180,18 +204,16 @@ class DistTrainer(): self.dev_data = dev_data self.metrics = metrics self.test_use_tqdm = True - self.kwargs = kwargs self.test_use_tqdm = kwargs.get('test_use_tqdm', self.use_tqdm) dev_batch_size = kwargs.get('dev_batch_size', batch_size_per_gpu) # for evaluation, only run eval on master proc if dev_data and metrics: cb = _TesterCallback( - dev_data, model, metrics, + dev_data, self.model, metrics, batch_size=dev_batch_size, num_workers=num_workers, sampler=kwargs.get('test_sampler', None), use_tqdm=self.test_use_tqdm) self.test_manager.add_callback([cb], master=True) - # Setup logging # 同步start_time sync_time = torch.tensor(time.time(), dtype=torch.double).to(self.device) @@ -211,29 +233,14 @@ class DistTrainer(): self.logger.info("Num of processes: {}".format(self.world_size)) self.logger.info("Use device: {}".format(device)) - def _maybe_no_sync(self): - """ - Whenever *samples* contains more than one mini-batch, we - want to accumulate gradients locally and only call - all-reduce in the last backwards pass. - """ - i = self.step % self.update_every - if ( - self.world_size > 1 - and hasattr(self.ddp_model, "no_sync") - and i != 0 - ): - return self.ddp_model.no_sync() - else: - return contextlib.ExitStack() # dummy contextmanager - def _get_n_steps(self): return len(self.data_iterator) * self.n_epochs def _get_data_iter(self, dataset): if isinstance(dataset, DataSet): return DataSetIter(dataset=dataset, batch_size=self.batch_size_per_gpu, sampler=self.sampler, - num_workers=self.num_data_workers, drop_last=self.drop_last) + num_workers=self.num_data_workers, drop_last=self.drop_last, + pin_memory=self.pin_memory) elif isinstance(dataset, BatchIter): return dataset else: @@ -339,6 +346,7 @@ class DistTrainer(): avg_loss = 0 data_iterator = self.data_iterator self.ddp_model.zero_grad() + self.batch_per_epoch = self.data_iterator.num_batches for epoch in range(1, self.n_epochs + 1): self.epoch = epoch pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) @@ -346,38 +354,42 @@ class DistTrainer(): self.callback_manager.on_epoch_begin() for batch_x, batch_y in data_iterator: self.step += 1 - self.ddp_model.train() - _move_dict_value_to_device(batch_x, batch_y, device=self.device) - indices = data_iterator.get_batch_indices() - # negative sampling; replace unknown; re-weight batch_y - self.callback_manager.on_batch_begin(batch_x, batch_y, indices) - with self.auto_cast(): - prediction = self._data_forward(self.ddp_model, batch_x) - # edit prediction - self.callback_manager.on_loss_begin(batch_y, prediction) - loss = self._compute_loss(prediction, batch_y) - - avg_loss += loss.detach() - - # Is loss NaN or inf? requires_grad = False - self.callback_manager.on_backward_begin(loss) - self.grad_scaler.scale(loss).backward() - self.callback_manager.on_backward_end() - if self.step % self.update_every == 0: + if self.step%self.update_every!=0: + no_sync = self.ddp_model.no_sync + else: + no_sync = contextlib.ExitStack + with no_sync(): + self.ddp_model.train() + _move_dict_value_to_device(batch_x, batch_y, device=self.device, non_blocking=self.pin_memory) + indices = data_iterator.get_batch_indices() + # negative sampling; replace unknown; re-weight batch_y + self.callback_manager.on_batch_begin(batch_x, batch_y, indices) + with self.auto_cast(): + prediction = self._data_forward(self.ddp_model, batch_x) + # edit prediction + self.callback_manager.on_loss_begin(batch_y, prediction) + loss = self._compute_loss(prediction, batch_y) + + avg_loss += loss.detach() + + # Is loss NaN or inf? requires_grad = False + self.callback_manager.on_backward_begin(loss) + self._grad_backward(loss) + self.callback_manager.on_backward_end() self._update() - self.callback_manager.on_step_end() + self.callback_manager.on_step_end() - if self.step % self.print_every == 0: - avg_loss = float(avg_loss) / self.print_every - print_output = "loss:{:<6.5f}".format(avg_loss) - pbar.update(self.print_every) - pbar.set_postfix_str(print_output) - avg_loss = 0 + if self.step % self.print_every == 0: + avg_loss = float(avg_loss) / self.print_every + print_output = "loss:{:<6.5f}".format(avg_loss) + pbar.update(self.print_every) + pbar.set_postfix_str(print_output) + avg_loss = 0 - self.callback_manager.on_batch_end() + self.callback_manager.on_batch_end() - if (self.validate_every > 0 and self.step % self.validate_every == 0) and len(self.test_manager.callbacks): - self._do_validation() + if (self.validate_every > 0 and self.step % self.validate_every == 0) and len(self.test_manager.callbacks): + self._do_validation() # ================= mini-batch end ==================== # if self.validate_every < 0 and len(self.test_manager.callbacks): @@ -390,7 +402,7 @@ class DistTrainer(): self.pbar = None # ============ tqdm end ============== # - def _clear_grad_opt(self, optimizer): + def _clear_grad(self, optimizer): if self.set_grad_to_none: for group in optimizer.param_groups: for p in group['params']: @@ -399,13 +411,24 @@ class DistTrainer(): else: optimizer.zero_grad() + def _grad_backward(self, loss): + r"""Compute gradient with link rules. + + :param loss: a scalar where back-prop starts + + For PyTorch, just do "loss.backward()" + """ + if (self.step-1) % self.update_every == 0: + self._clear_grad(self.optimizer) + self.grad_scaler.scale(loss).backward() + def _update(self): r"""Perform weight update on a model. """ - self.grad_scaler.step(self.optimizer) - self.grad_scaler.update() - self._clear_grad_opt(self.optimizer) + if self.step % self.update_every == 0: + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() def _data_forward(self, network, x): x = _build_args(self._forward_func, **x) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 7c671be7..3bce8733 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -320,7 +320,7 @@ class BCEWithLogits(LossBase): :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` :param int class_in_dim: 在序列标注的场景中,pred可能的shape为(batch_size, max_len, num_classes) - 或(batch_size, num_classes, max_len), CrossEntropyLoss需要知道哪一维是class的维度以计算loss。如果为-1,就根据pred的第 + 或(batch_size, num_classes, max_len), BCEWithLogits需要知道哪一维是class的维度以计算loss。如果为-1,就根据pred的第 二维是否等于target的第二维来判断是否需要交换pred的第二维和第三维,因为target的第二维是length的维度,如果这一维度上和pred相等, 那么pred可能第二维也是长度维(存在误判的可能,如果有误判的情况,请显示设置该值)。其它大于0的值则认为该维度是class的维度。 :param str reduction: 支持 `mean` ,`sum` 和 `none` . @@ -340,7 +340,7 @@ class BCEWithLogits(LossBase): pred = pred.transpose(1, 2) else: pred = pred.transpose(-1, self.class_in_dim) - pred = pred.reshape(-1, pred.size(-1)) + pred = pred.reshape(-1) target = target.reshape(-1) return F.binary_cross_entropy_with_logits(input=pred, target=target, reduction=self.reduction) diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 4cf83fac..55ffd9cf 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -98,6 +98,7 @@ class Tester(object): :param bool fp16: 是否使用float16进行验证 :param kwargs: Sampler sampler: 支持传入sampler控制测试顺序 + bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快数据速度。 """ super(Tester, self).__init__() @@ -112,6 +113,7 @@ class Tester(object): self.verbose = verbose self.use_tqdm = use_tqdm self.logger = logger + self.pin_memory = kwargs.get('pin_memory', True) if isinstance(data, DataSet): sampler = kwargs.get('sampler', None) @@ -122,7 +124,8 @@ class Tester(object): if hasattr(sampler, 'set_batch_size'): sampler.set_batch_size(batch_size) self.data_iterator = DataSetIter(dataset=data, batch_size=batch_size, sampler=sampler, - num_workers=num_workers) + num_workers=num_workers, + pin_memory=self.pin_memory) elif isinstance(data, BatchIter): self.data_iterator = data else: @@ -179,7 +182,8 @@ class Tester(object): start_time = time.time() for batch_x, batch_y in data_iterator: - _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) + _move_dict_value_to_device(batch_x, batch_y, device=self._model_device, + non_blocking=self.pin_memory) with self.auto_cast(): pred_dict = self._data_forward(self._predict_func, batch_x) if not isinstance(pred_dict, dict): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 628b9711..f4f8a093 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -334,6 +334,7 @@ try: except: from .utils import _pseudo_tqdm as tqdm import warnings +from pkg_resources import parse_version from .batch import DataSetIter, BatchIter from .callback import CallbackManager, CallbackException, Callback @@ -432,6 +433,7 @@ class Trainer(object): bool set_grad_to_none: 在zero_grad的时候是否将gradient设置为None,而不是设置为zero GradScaler grad_scaler: 仅在fp16为True时有效,如果不使用torch.cuda.amp.GradScaler的初始化参数,可传入一个已经初始化后的 grad_scaler。 + bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快数据速度。 """ super(Trainer, self).__init__() if not isinstance(model, nn.Module): @@ -472,7 +474,8 @@ class Trainer(object): warnings.warn("num_workers is ignored when train_data is BatchIter.") if drop_last: warnings.warn("drop_last is ignored when train_data is BatchIter.") - + # concerning issue from https://github.com/pytorch/pytorch/issues/57273 + self.pin_memory = kwargs.get('pin_memory', False if parse_version(torch.__version__)==parse_version('1.9') else True) if isinstance(model, nn.parallel.DistributedDataParallel): # 如果是分布式的 # device为None if device is not None: @@ -502,12 +505,13 @@ class Trainer(object): sampler(train_data) train_data = DataSetIter(train_data, batch_size=1, sampler=None, as_numpy=False, num_workers=num_workers, - pin_memory=False, drop_last=drop_last, timeout=0, worker_init_fn=None, + pin_memory=self.pin_memory, drop_last=drop_last, timeout=0, worker_init_fn=None, batch_sampler=sampler) if isinstance(train_data, DataSet): self.data_iterator = DataSetIter(dataset=train_data, batch_size=batch_size, sampler=sampler, - num_workers=num_workers, drop_last=drop_last) + num_workers=num_workers, drop_last=drop_last, + pin_memory=self.pin_memory) elif isinstance(train_data, BatchIter): self.data_iterator = train_data train_data = train_data.dataset @@ -599,7 +603,9 @@ class Trainer(object): verbose=0, use_tqdm=self.test_use_tqdm, sampler=kwargs.get('test_sampler', None), - fp16=self.test_use_fp16) + fp16=self.test_use_fp16, + num_workers=num_workers, + pin_memory=self.pin_memory) self.start_time = None # start timestamp @@ -759,6 +765,13 @@ class Trainer(object): # lr decay; early stopping self.callback_manager.on_epoch_end() # =============== epochs end =================== # + if self.dev_data is not None and (self.validate_every>0 and self.n_steps%self.validate_every!=0): + eval_res = self._do_validation(epoch=epoch, step=self.step) + eval_str = "Evaluation on dev at Epoch {}/{}. Step:{}/{}: ".format(epoch, self.n_epochs, self.step, + self.n_steps) + # pbar.write(eval_str + '\n') + self.logger.info(eval_str) + self.logger.info(self.tester._format_eval_results(eval_res) + '\n') pbar.close() self.pbar = None # ============ tqdm end ============== # diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 81c0fe42..09c44d6c 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -170,7 +170,7 @@ class StaticEmbedding(TokenEmbedding): if model_path: embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method) else: - embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method) + embedding = self._randomly_init_embed(len(lowered_vocab), embedding_dim, init_method) self.register_buffer('words_to_words', torch.arange(len(vocab)).long()) if lowered_vocab.unknown: unknown_idx = lowered_vocab.unknown_idx diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index 8528ebf8..cfce4de4 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -31,7 +31,8 @@ class DataBundle: r""" :param vocabs: 从名称(字符串)到 :class:`~fastNLP.Vocabulary` 类型的dict - :param datasets: 从名称(字符串)到 :class:`~fastNLP.DataSet` 类型的dict + :param datasets: 从名称(字符串)到 :class:`~fastNLP.DataSet` 类型的dict。建议不要将相同的DataSet对象重复传入,可能会在 + 使用Pipe处理数据的时候遇到问题,若多个数据集确需一致,请手动deepcopy后传入。 """ self.vocabs = vocabs or {} self.datasets = datasets or {} diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 51df1206..b547ce37 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -58,6 +58,8 @@ __all__ = [ "ChnSentiCorpLoader", "THUCNewsLoader", "WeiboSenti100kLoader", + "MRLoader", + "R8Loader", "R52Loader", "OhsumedLoader", "NG20Loader", 'ConllLoader', 'Conll2003Loader', @@ -88,7 +90,8 @@ __all__ = [ ] from .classification import CLSBaseLoader, YelpFullLoader, YelpPolarityLoader, AGsNewsLoader, IMDBLoader, \ SSTLoader, SST2Loader, DBPediaLoader, \ - ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader + ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader,\ + MRLoader, R8Loader, R52Loader, OhsumedLoader, NG20Loader from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader from .coreference import CoReferenceLoader diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index 23af5d8f..7f7a2667 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -11,7 +11,13 @@ __all__ = [ "SST2Loader", "ChnSentiCorpLoader", "THUCNewsLoader", - "WeiboSenti100kLoader" + "WeiboSenti100kLoader", + + "MRLoader", + "R8Loader", + "R52Loader", + "OhsumedLoader", + "NG20Loader", ] @@ -512,3 +518,123 @@ class WeiboSenti100kLoader(Loader): """ output_dir = self._get_dataset_path('weibo-senti-100k') return output_dir + +class MRLoader(CLSBaseLoader): + def __init__(self): + super(MRLoader, self).__init__() + + def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str: + r""" + 自动下载数据集 + + 如果dev_ratio不等于0,则根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。 + 下载完成后在output_dir中有train.csv, test.csv, dev.csv三个文件。否则只有train.csv和test.csv + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + """ + dataset_name = r'mr' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + data_dir = _split_dev(dataset_name=dataset_name, + data_dir=data_dir, + dev_ratio=dev_ratio, + re_download=re_download, + suffix='csv') + return data_dir + +class R8Loader(CLSBaseLoader): + def __init__(self): + super(R8Loader, self).__init__() + + def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str: + r""" + 自动下载数据集 + + 如果dev_ratio不等于0,则根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。 + 下载完成后在output_dir中有train.csv, test.csv, dev.csv三个文件。否则只有train.csv和test.csv + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + """ + dataset_name = r'R8' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + data_dir = _split_dev(dataset_name=dataset_name, + data_dir=data_dir, + dev_ratio=dev_ratio, + re_download=re_download, + suffix='csv') + return data_dir + +class R52Loader(CLSBaseLoader): + def __init__(self): + super(R52Loader, self).__init__() + + def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str: + r""" + 自动下载数据集 + + 如果dev_ratio不等于0,则根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。 + 下载完成后在output_dir中有train.csv, test.csv, dev.csv三个文件。否则只有train.csv和test.csv + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + """ + dataset_name = r'R52' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + data_dir = _split_dev(dataset_name=dataset_name, + data_dir=data_dir, + dev_ratio=dev_ratio, + re_download=re_download, + suffix='csv') + return data_dir + +class NG20Loader(CLSBaseLoader): + def __init__(self): + super(NG20Loader, self).__init__() + + def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str: + r""" + 自动下载数据集 + + 如果dev_ratio不等于0,则根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。 + 下载完成后在output_dir中有train.csv, test.csv, dev.csv三个文件。否则只有train.csv和test.csv + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + """ + dataset_name = r'20ng' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + data_dir = _split_dev(dataset_name=dataset_name, + data_dir=data_dir, + dev_ratio=dev_ratio, + re_download=re_download, + suffix='csv') + return data_dir + +class OhsumedLoader(CLSBaseLoader): + def __init__(self): + super(OhsumedLoader, self).__init__() + + def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str: + r""" + 自动下载数据集 + + 如果dev_ratio不等于0,则根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。 + 下载完成后在output_dir中有train.csv, test.csv, dev.csv三个文件。否则只有train.csv和test.csv + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + """ + dataset_name = r'ohsumed' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + data_dir = _split_dev(dataset_name=dataset_name, + data_dir=data_dir, + dev_ratio=dev_ratio, + re_download=re_download, + suffix='csv') + return data_dir \ No newline at end of file diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 346825ca..94784515 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -23,6 +23,7 @@ __all__ = [ "ChnSentiCorpPipe", "THUCNewsPipe", "WeiboSenti100kPipe", + "MRPipe", "R52Pipe", "R8Pipe", "OhsumedPipe", "NG20Loader", "Conll2003NERPipe", "OntoNotesNERPipe", @@ -59,7 +60,7 @@ __all__ = [ ] from .classification import CLSBasePipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, \ - WeiboSenti100kPipe, AGsNewsPipe, DBPediaPipe + WeiboSenti100kPipe, AGsNewsPipe, DBPediaPipe, MRPipe, R8Pipe, R52Pipe, OhsumedPipe, NG20Loader from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe from .conll import Conll2003Pipe from .coreference import CoReferencePipe diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index 9475a092..41682d3e 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -11,7 +11,8 @@ __all__ = [ 'IMDBPipe', "ChnSentiCorpPipe", "THUCNewsPipe", - "WeiboSenti100kPipe" + "WeiboSenti100kPipe", + "MRPipe", "R8Pipe", "R52Pipe", "OhsumedPipe", "NG20Pipe" ] import re @@ -28,7 +29,7 @@ from .utils import get_tokenizer, _indexize, _add_words_field, _add_chars_field, from ..data_bundle import DataBundle from ..loader.classification import ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader, \ - AGsNewsLoader, DBPediaLoader + AGsNewsLoader, DBPediaLoader, MRLoader, R52Loader, R8Loader, OhsumedLoader, NG20Loader from ...core._logger import logger from ...core.const import Const from ...core.dataset import DataSet @@ -827,3 +828,117 @@ class WeiboSenti100kPipe(CLSBasePipe): data_bundle = data_loader.load(paths) data_bundle = self.process(data_bundle) return data_bundle + +class MRPipe(CLSBasePipe): + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): + r""" + + :param bool lower: 是否将words列的数据小写。 + :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process_from_file(self, paths=None): + r""" + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = MRLoader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + +class R8Pipe(CLSBasePipe): + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): + r""" + + :param bool lower: 是否将words列的数据小写。 + :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process_from_file(self, paths=None): + r""" + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = R8Loader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + +class R52Pipe(CLSBasePipe): + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): + r""" + + :param bool lower: 是否将words列的数据小写。 + :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process_from_file(self, paths=None): + r""" + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = R52Loader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + +class OhsumedPipe(CLSBasePipe): + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): + r""" + + :param bool lower: 是否将words列的数据小写。 + :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process_from_file(self, paths=None): + r""" + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = OhsumedLoader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + +class NG20Pipe(CLSBasePipe): + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): + r""" + + :param bool lower: 是否将words列的数据小写。 + :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process_from_file(self, paths=None): + r""" + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = NG20Loader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle \ No newline at end of file diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py index 7416382d..0ff32d83 100644 --- a/fastNLP/io/pipe/pipe.py +++ b/fastNLP/io/pipe/pipe.py @@ -27,15 +27,15 @@ class Pipe: 对输入的DataBundle进行处理,然后返回该DataBundle。 :param ~fastNLP.DataBundle data_bundle: 需要处理的DataBundle对象 - :return: + :return: DataBundle """ raise NotImplementedError - def process_from_file(self, paths) -> DataBundle: + def process_from_file(self, paths: str) -> DataBundle: r""" 传入文件路径,生成处理好的DataBundle对象。paths支持的路径形式可以参考 ::meth:`fastNLP.io.Loader.load()` - :param paths: + :param str paths: :return: DataBundle """ raise NotImplementedError diff --git a/setup.py b/setup.py index 2a75a42c..d4a71c33 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ print(pkgs) setup( name='FastNLP', - version='0.6.0', + version='0.7.0', url='https://gitee.com/fastnlp/fastNLP', description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team', long_description=readme, diff --git a/tests/core/test_dataset.py b/tests/core/test_dataset.py index d7c95a26..7d38601c 100644 --- a/tests/core/test_dataset.py +++ b/tests/core/test_dataset.py @@ -345,6 +345,14 @@ class TestDataSetMethods(unittest.TestCase): ds.apply_field(lambda x: x, 'idx', 'idx') self.assertTrue(isinstance(ds.get_field('idx').padder, AutoPadder)) # should be None, but AutoPadder + def test_instance_field_disappear_bug(self): + data = DataSet({'raw_chars': [[0,1],[2]], 'target': [0, 1]}) + data.copy_field(field_name='raw_chars', new_field_name='chars') + _data = data[:1] + for field_name in ['raw_chars', 'target', 'chars']: + self.assertTrue(_data.has_field(field_name)) + + class TestDataSetIter(unittest.TestCase): def test__repr__(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) diff --git a/tests/data_for_tests/io/20ng/dev.csv b/tests/data_for_tests/io/20ng/dev.csv new file mode 100644 index 00000000..1cfb7c56 --- /dev/null +++ b/tests/data_for_tests/io/20ng/dev.csv @@ -0,0 +1,6 @@ +talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject clarification organization cookamunga tourist bureau lines 14 sorry , san jose based rosicrucian order called r c , n't remember time stand r c ordo rosae crucis , words latin order rose cross sigh , seems loosing long term memory otherwise headquarters san jose pretty decent metaphysical bookstore , interested books son loves run around egyptian museum cheers , kent sandvik newton apple com alink ksand private activities net" +talk.religion.misc,"subject catholic lit nunnally acs harding edu \( john nunnally \) distribution world organization harding university , , ar nntp posting host acs harding edu x news reader vms news 1 reply dlphknob camelot bradley edu 's message 16 apr 93 18 57 20 gmtlines 45 lines 45 dlphknob camelot dlphknob camelot bradley edu writes 1993apr14 476 mtechca maintech com foster mtechca maintech com writes surprised saddened would expect kind behavior evangelical born gospel thumping face 're true christian protestants , always thought catholics behaved better please stoop level e b g f w c protestants , think best way witness strident , intrusive , loud , insulting self righteous \( pleading mode \) please ! i'm begging ! quit confusing religious groups , stop making generalizations ! i'm protestant ! i'm evangelical ! n't believe way way ! i'm creation scientist ! n't think homosexuals hung ! want discuss bible thumpers , would better singling \( making generalizations \) fundamentalists compared actions methodists southern baptists , would think different religions ! sarcasm sure pick correct groups bible thumpers , fundamentalists , southern baptists deserve hasty generalizations prejudicial statements n't pick methodists ! sarcasm please , prejudice thinking people group , please n't write protestants evangelicals ! \( pleading mode \) god wish could get ahold thomas stories n , n tha gb , gb n yvan sasha david cole iv chief research dlphknob camelot bradley edu" +talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject alt sex stories literary critical analysis \) organization cookamunga tourist bureau lines 16 article h7v agate berkeley edu , dzkriz ocf berkeley edu \( dennis kriz \) wrote i'm going try something , perhaps many would thought even possible want begin process initiating literary critical study pornography posted alt sex stories , identify major themes motifs present stories posted opening possibility objective moral evaluation material present dennis , i'm astounded n't know interested even study filth alt sex stories provide cheers , kent sandvik newton apple com alink ksand private activities net" +talk.religion.misc,"anthony landreneau ozonehole com \( anthony landreneau \) subject abortion distribution world organization ozone online operations , inc , dba ozone hole bbs reply anthony landreneau ozonehole com \( anthony landreneau \) lines 21 margoli watson ibm com \( larry margolis \) anthony landreneau ozonehole com lm rape passed , nothing ever take away lm true forcing remain pregnant continues violation lm body another 9 months see unbelievably cruel life violation cruel , killing living solely friend right cold anthony slmr 2 1 's difference orange \? ozone hole bbs private bulletin board service \( 504 \) 891 3142 3 full service nodes usrobotics 16 8k bps 10 gigs 100 , 000 files skydive new orleans ! rime network mail hub 500 usenet newsgroups please route questions inquiries postmaster ozonehole com" +talk.religion.misc,"kevin rotag mi org \( kevin darcy \) subject 2000 years , say christian morality organization , \? \? \? lines 15 article pww spac at1 59 rice edu pww spacsun rice edu \( peter walker \) writes article 1993apr18 rotag mi org , kevin rotag mi org \( kevin darcy \) wrote , one , considered intentionality primary ontological stuff built perceptions , consciousness , thoughts , etc frank means alone seeing intentionality \( values , puts \) underlying human experience , even called objective experiences , measurements natural world , output des chip others us see intellectual masturbation 'll defer greater firsthand knowledge matters kevin" +talk.religion.misc,"bil okcforum osrhe edu \( bill conner \) subject 2000 years , say christian morality nntp posting host okcforum osrhe edu organization okcforum unix users group x newsreader tin version 1 1 pl9 lines 54 mind , say science basis values bit reach science basis observable fact 'd say one chooses observe observation interpreted significance 's given depends great deal values observer science human activity , subject potential distortion human activity myth scientists moral influence ethical concern , knowledge whole pure nature biases scientist , nonsense bill one argue objective values \( moral sense \) one must first start demonstrating morality objective considering meaning word objective doubt ever happen , back original question objective morality \? may unfortunate choice words , almost self contradictory objective sense used means something immutable absolute morality describes behavior group people first term inclusive , second specific concept supposedly described may meaning however god described christians \( instance \) , existence apart independent humankind existence outside frame reference \( reality \) declares thing , necessarily since defined omnipotent , claims believed , least omnipotent relative us god intrinsically self defined reality whatever says objective sense god determines standard conduct , standard objective human beings held accountable conformance standard permitted ignore , substitute relative morality mode conduct , giving term morality nebulous , meaningless sense argued pretending misunderstand standard objective conduct required meet standard therefore objectively determined convenient pretend term morality infinitely , n't mean objective standard n't exist morality come mean little cultural norm , preferred conduct decent people , making seem subjective , derived absolute , objective , standard ironically , objective standard perfect accord true nature \( according christianity least \) , yet condemned contrary human , oppressive severe may due bill much amoral standard , like , 's x" diff --git a/tests/data_for_tests/io/20ng/test.csv b/tests/data_for_tests/io/20ng/test.csv new file mode 100644 index 00000000..b636bc65 --- /dev/null +++ b/tests/data_for_tests/io/20ng/test.csv @@ -0,0 +1,6 @@ +talk.religion.misc,"halat pooh bears \( jim halat \) subject 2000 years , say christian morality reply halat pooh bears \( jim halat \) lines 43 article 1993apr15 wam umd edu , wam umd edu \( jay stein objectively subjective \) writes horus ap mchp sni de frank d012s658 uucp \( frank o'dwyer \) discussion christianity objective morals question effective difference objective values exist , disagreement values subjective \? n't see difference saying absolute truth exists , people think lie truth relative \? think examples , first statement fundamental disagreement least two people second statement agreed upon put another way , someone says objective values exist agree values subjective jim halat" +talk.religion.misc,"halat pooh bears \( jim halat \) subject 2000 years , say christian morality reply halat pooh bears \( jim halat \) lines 17 article na4 horus ap mchp sni de , frank d012s658 uucp \( frank o'dwyer \) writes really \? n't know objective value \? offered people u , collectively , 1 land america , would sound like good deal \? happens subjective example people us would happen agree continue move price point people would accept probably would accept high enough number endpoints subjective scale given homes objective viewpoints jim halat" +talk.religion.misc,"halat pooh bears \( jim halat \) subject 2000 years , say christian morality reply halat pooh bears \( jim halat \) lines 34 article horus ap mchp sni de , frank d012s658 uucp \( frank o'dwyer \) writes firstly , science basis values , way round better explain objective atoms , get subjective values , go atoms objective n't even real scientists call atom nothing mathematical model describes certain physical , observable properties surroundings subjective objective , though , approach scientist takes discussing model observations objective science objective approach subjectively selected scientist objective case means specified , unchanging set rules colleagues use discuss science contrast objective morality may objective approach subjectively discuss beliefs morality exists objective morality also , science deals discuss observations physical world around us method discussion objective \( science discussion \) science makes claims know even sometimes observe simply gives us way discuss surroundings meaningful , consistent way think bohr said \( paraphrase \) science say physical world jim halat" +talk.religion.misc,"mwilson ncratl atlantaga ncr com \( mark wilson \) subject message mr president know happened \? organization ncr engineering manufacturing atlanta atlanta , ga lines 58 noose ecn purdue edu tbrent bank ecn purdue edu \( timothy j brent \) writes probably , n't pack heavy weaponry intent use please cite evidence intending use n't really think allowed keep stuff \? \? , tell live sure steer well clear check sig public also rights , placed individual society rights individuals rights go ahead , call commie , ok , commie 'd singing different tune exercised right rape daughter think right rape anyone \? wonder n't care others broke law , please indicate law feel koresh broke , convicted said crime threat society , feel owning guns makes threat society ou going start going knives baseball bats well feel someone spouts unpopular ideas definition threat society job simple simple think job assualt civilians support first , second , fourth , fifth , sixth , eighth amendment rights , lest taken away fbi davidians think 'll support \( except 2 \) words n't support mob rule n't prettier merely mob calls government ai n't charity using someone else 's money wilson 's theory relativity go back far enough , 're related mark wilson atlantaga ncr com" +talk.religion.misc,"alizard tweekco uucp \( lizard \) subject 14 apr 93 god 's promise 1 john 1 7 organization com systems bbs , , ca \( 510 \) 631 lines 20 starowl rahul net \( michael adams \) writes anyone netland process devising new religion , use lamb bull , already reserved please choose another animal , preferably one endangered species list washed blood barney dinosaur \? \) judging postings 've read usenet non usenet bbs conferences , barney definitely endangered species especially runs dark alley lizard lizard internet addresses alizard tweekco boo pacbell com \( preferred \) pacbell com ! boo ! tweekco ! alizard \( bang path \) alizard com \( backup \) pgp2 2 public key available request" +talk.religion.misc,"alizard tweekco uucp \( lizard \) subject oto , ancient order oriental templars organization com systems bbs , , ca \( 510 \) 631 lines 18 thyagi cup portal com \( thyagi morgoth nagasiva \) writes organization known present time ancient order oriental templars ordo templi orientis otherwise hermetic brotherhood light organization official e mail address days \? \( address sf bay area lodges , e g would \) 93 lizard lizard internet addresses alizard tweekco boo pacbell com \( preferred \) pacbell com ! boo ! tweekco ! alizard \( bang path \) alizard com \( backup \) pgp2 2 public key available request" diff --git a/tests/data_for_tests/io/20ng/train.csv b/tests/data_for_tests/io/20ng/train.csv new file mode 100644 index 00000000..55307ad6 --- /dev/null +++ b/tests/data_for_tests/io/20ng/train.csv @@ -0,0 +1,6 @@ +talk.religion.misc,"deane binah cc brandeis edu \( david matthew deane \) subject flaming nazis reply deane binah cc brandeis edu organization brandeis university lines 106 okay , 'll bite probably leave alone , heck article 1993apr14 422 sun0 urz uni heidelberg de , gsmith lauren iwr uni heidelberg de \( gene w smith \) writes article brewich hou tx us popec brewich hou tx us \( pope charles \) writes name guy responsible much uniforms , props used early nazis rallies name roehm , hitler claim came swastika business n't credit actual flag design party member dentist \? believe gives credit mein kampf killed early nazi purge many associates flaming homosexuals well know also trying find actual evidence common assertion recently postings groups soc history soc culture german uncovered net experts could provide well , i'm expert , histories nazi germany assert make reference several scandals occurred long night long knives impression got homosexuality portions sa common knowledge also , book \( homosexual author whose name escapes moment \) called homosexuals history asserts roehm heines homosexuals , well others roehm 's sa circle books say roehm associate , edmund heines , homosexual able find nothing beyond , suspect sort historical urban legend well , 're one germany n't believe history books , look primary sources us outside germany access seems plenty documented instances several scandals , fact knight long knives several sa members \( including heines \) found sleeping together , etc also believe people complaining sa 's homosexual activities \( young boys , etc \) histories 've read make convincing case none sounds like urban legend \( irving , notoriously unreliable historian , says funk , nazi finance minister , homosexual gives sources \) know next nothing irving nothing funk precisely know , would contradict history books read concerning existence homosexual nazis \? trying say historians taking part anti homosexual smear \? homosexual writers agree official history \? n't think would found truth roehm heines homosexuals \? would think would want homosexuality nazism one use connection two bash homosexuals case challenge anyone document claim going challenge historians point \( irving \) , burden proof track references find stories originate , one germany , close archival material people net found great deal evidence many flaming heterosexuals among nazis seems include worst ones hitler , himmler , goebbels , goering , , eichmann , many eh \? agenda \? prove nazis heterosexuals , bash heterosexuals \? bother nazis might homosexuals \? make homosexuals bad true \? course bisexuals \? half nazis \? n't know would difficult believe nazis homosexuals german officer corps ww1 , instance , notorious homosexuality numerous scandals rocked german govt late 19th early 20th century many kaiser 's friends prosecuted kaiser homosexual , germany army long tradition homosexuality , going far back prussian history back frederick great least , homosexual roehm product prussian officer tradition , old german army \( like english public school system \) , well known center homosexuality , would quite willing overlook roehm 's homosexuality addition , nazis complained homosexuality hitler youth hitler youth swallowed pre nazi youth groups , various pre war , bund , youth groups known promote ideals friendship , many cases , homosexuality seems unlikely plenty homosexual nazis , regardless official nazi dogmas concerning evils homosexuality suprise anyone \? homosexuality always existed , societies would unusual nazis exception , n't sources , think kind proof accept would citations archival material , access intend reread every book nazis modern homosexuality ever read n't time nothing stopping , however , chasing sources prove otherwise , though , stick established histories david matthew deane \( deane binah cc brandeis edu \) eternal bleak wind let gods speak softly us days hereafter \( ezra pound \)" +talk.religion.misc,"psyrobtw ubvmsd cc buffalo edu \( robert weiss \) subject 18 apr 93 god 's promise philippians 4 9 organization university buffalo lines 8 news software vax vms vnews 1 41 nntp posting host ubvmsd cc buffalo edu things , ye learned , received , heard , seen , god peace shall philippians 4 9" +talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject 14 apr 93 god 's promise 1 john 1 7 organization cookamunga tourist bureau lines 14 article tweekco uucp , alizard tweekco uucp \( lizard \) wrote judging postings 've read usenet non usenet bbs conferences , barney definitely endangered species especially runs dark alley please , please n't make barney modern martyr saviour mythical figure , humans create religion name , life unbearable \) cheers , kent sandvik newton apple com alink ksand private activities net" +talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject disillusioned protestant finds christ organization cookamunga tourist bureau lines 23 article boi hp com , jburrill boi hp com \( jim burrill \) wrote jesus never taught concept trinity , deal following mat 28 18 jesus came said , authority heaven earth given mat 28 19 therefore go make disciples nations , baptizing name father son holy spirit , mat 28 20 teaching obey everything commanded surely always , end age jim , please , 's lame explanation trinity jesus provides baptizing people name three things ! trinity case , i'm wrong , assumed trinity implies god three entities , yet cheers , kent sandvik newton apple com alink ksand private activities net" +talk.religion.misc,"cutter gloster via mind org \( cutter \) subject biblical backing koresh 's 3 02 tape \( cites enclosed \) distribution world organization gordian knot , gloster , ga lines 22 netd susie sbc com \( \) writes article 20apr199301460499 utarlg uta edu b645zaw utarlg uta edu \( stephen think david koresh n't solid structure , sound biblical backing hour long tape broadcast , n't think anyone really cares solid structure sermon 's deaths 's responsible concern people think ought hold christ followers died hand romans also fault believing god , society reminds roman empire every day guess 'll log go watch american cutter gloster via mind org \( chris \) jobs easy person n't holt 's law" +talk.religion.misc,"subject albert sabin rfox charlie usd edu \( rich fox , univ south dakota \) reply rfox charlie usd edu organization university south dakota computer science dept nntp posting host charlie lines 91 article 1993apr15 nntpd2 cxo dec com , sharpe enet dec com \( system privileged account \) writes article 885 sunfish usd edu , rfox charlie usd edu \( rich fox , univ south dakota \) writes article 1993apr10 rambo atlanta dg com , wpr atlanta dg com \( bill rawlins \) writes earlier dialogue deleted perhaps read stop advancing bible evidence relating questions science jesus exist \? g wells great fallacy statement question origins based science alone nope , fallacy yep , science best determining religions handle rich , curious others award custody baby theists religion \? hope n't award custody , rich purposely used handle order avoid e , happens religions \( course like scientific creationism \) used best part indicate science currently time , domains mostly ignored also attempted brief , doubt confused matter aside , science written nobody seems argue theists , theologians better investigate magicians , , , athiests agnostics seems answer would vary individual individual i'm trying evasive , societal perspective , religion works hand , sometimes abused misused , many suffer , know net result seems positive , anthropological perspective human affairs might call neo insofar think masses ca n't get along without religion generally incapable n't , myriad reasons , main one seems promise immortality , immortality therefore seems theologians better equipped others mention answers suggest holds regardless truth answers simply people believe end , spiritual beliefs real scientific facts explanation \( caution take context \) suggest forever closed scientific investigation \? fact , n't think closed , least individuals n't group theoretical physicists argue matter created nothing big bang singularity \? approach might absence , except seems could argued something responsible nothing \? maybe something n't supernatural , maybe 's tough one people today grasp case , theory without empirical data explanation , question require data words , agree theorizing \( within scientific parameters \) scientific explaining answer , closed scientists , sense science currently inadequate data necessary improvement , seems long way , ever pretty convoluted hope 've made sense seems 200 years ago , question origin life earth considered open scientific agree generally prefer put way questions , , open inquiry enlightenment , reason questioning theological answers , , part , science thus born curiosity , eventually away largely leaving behind ignorant , selfish , intolerant , arrogant , course , still claim authority four domains rich fox , anthro , usouthdakota like discussion around , figure original post \) much obliged funny facts tend things , n't \? well , sure plenty scientific creationist somewhere , even created nothing record , , modern humans best regards \) , rich fox , anthro , usouthdakota" diff --git a/tests/data_for_tests/io/R52/dev.csv b/tests/data_for_tests/io/R52/dev.csv new file mode 100644 index 00000000..37eab6ad --- /dev/null +++ b/tests/data_for_tests/io/R52/dev.csv @@ -0,0 +1,6 @@ +trade,canadians urge exemption u trade bill group canadian lawmakers ontario today asked u counterparts exempt canada mandatory trade retaliation provisions major trade bill considered u congress meeting northeast midwest coalition organization u legislators david cooke chairman ontario parliament select committee economic affairs said exemption would help trade relations trade legislation considered full house late april would require president reagan retaliate foreign unfair trade practices unless trade actions would harm u economy currently reagan reject trade sanctions grounds cooke member liberal party told u congressmen understand trade bill think concerns parts world would suggest best concerns canada consider country bill added canada united states largest trading partner two way trade billion dlrs according coalition u ran billion dlr deficit manufactured goods year compared billion dlr surplus services trade reuter +earn,american corp nd qtr feb shr profit one cts vs loss three cts net profit vs loss revs mln vs mln six months shr profit six cts vs loss six cts net profit mln vs loss mln revs mln vs mln note six months includes gain four cts change accounting principle reuter +earn,meyers co increases dividend qtly div eight cts vs seven cts prior payable may record april reuter +earn,meyers co year feb shr dlrs vs dlrs net mln dlrs vs mln revs mln vs mln note results reflect year month period company changed fiscal year end february march reuter +earn,kelly oil gas partners year dec shr cts vs cts net mln vs mln revs mln vs mln reuter +money-fx,japan seeks strengthen paris currency accord japan seek strengthen paris accord currency stability meeting group seven leading industrial nations tomorrow japanese officials said however officials japanese finance minister kiichi miyazawa asked identified would provide details wanted accord signed six leading industrial democracies february strengthened currency target zones reference ranges discussed g meeting scheduled tomorrow japanese officials said meeting held conjunction week international monetary fund world bank sessions currency pact need changing language used paris accord officials said miyazawa met u treasury secretary james baker early afternoon discussed dollar yen exchange rates officials said declined disclosed details discussion japanese officials also declined detail miyazawa baker discussed subject greater joint intervention currency markets stabilize dollar independent american intervention officials said money market action stabilize dollar benefit japan suffering sharp appreciation currency also benefit united states well u japan take steps boost domestic demand reduce trade surplus japan explain economic measures g officials said however miyazawa failed outline size japanese economic package meeting baker today japanese budget authorized parliament despite new fiscal year started april one officials said japan ruling liberal democratic party revealed economic package today calling billion yen additional spending reuter diff --git a/tests/data_for_tests/io/R52/test.csv b/tests/data_for_tests/io/R52/test.csv new file mode 100644 index 00000000..99497e79 --- /dev/null +++ b/tests/data_for_tests/io/R52/test.csv @@ -0,0 +1,6 @@ +pet-chem,italy eni invest venezuelan projects italy state owned ente nazionale idrocarburi eni invest mln dlrs two joint ventures coal petroleos de venezuela eni president franco said speaking news conference said two projects eventually bring mln dlrs annually foreign exchange venezuela help diversify country export base joint ventures principal instrument allowing resources industrialized countries developing world lead future growth said eni subsidiary join petrochemical subsidiary pdvsa building mln dlr plant produce gasoline additive used increase octane levels mt per year plant jose eastern venezuela fed butane produced pdvsa eastern complex eni owns pct joint venture company super c pct remaining three pct sold private investors production set begin third quarter officials said plant one saudi arabia another eni subsidiary agip sign letter intent caracas tomorrow enter partnership pdvsa mine coal deposits western state said feasibility studies still done project definitive accord slated august added agip atlantic richfield coal arco subsidiary formed consortium pct project whose total cost estimated mln dlrs company said agip invest pct mln dlrs project said reuter +earn,republicbank rpt brazil loans republicbank corp said placed mln dlrs intermediate term loans brazil non accrual basis march said reclassification reduce first quarter earnings mln dlrs taxes mln dlrs taxes brazil change position moratorium interest payments republicbank also said net income first quarter expected mln dlrs cts share fully diluted basis year ago first quarter company earned mln dlrs cts share company also said first quarter results expected include provision loan losses mln dlrs mln dlrs net loan charge offs mln dlrs said provision increase loan losses mln dlrs pct loans republicbank total assets billion dlrs announced december agreement interfirst corp form first republicbank corp merger approved regulatory agencies stockholders would create th largest bank holding company united states reuter +acq,amoskeag bank seek rehearing amoskeag bank shares inc portsmouth savings bank said file rehearing new hampshire supreme court march ruling state regulatory approval amoskeag acquisition portsmouth decision believe go well beyond affiliation amoskeag portsmouth savings bank said amoskeag chairman william transaction opposed group portsmouth investors wanted bank remain independent according press reports reuter +strategic-metal,doe recommends special unit uranium energy secretary john herrington told congress federally chartered corporation would best way manage operate government uranium program said letter congressmen unless program run energy department improved sales worth five billion dlrs could lost program annual commercial sales one billion dlrs holds pct free world market services department official said world market uranium power utilities increasingly competitive private entity could better tap administration plan spin department uranium operation line effort reduce federal government role areas feels private enterprise could efficient reuter +earn,declares stock dividend financial corp said declared stock dividend one class share two class shares held payable may shareholders record april reuter +acq,allegheny ag shareholders file suit allegheny international inc agreed merge jointly formed first boston inc affiliate deal worth mn dlrs said shareholders preferred stock filed class action complaint company complaint alleges among things company board agreed pay first boston illegal seven mln dlr fee received higher offer company prior buyout suit fee allegheny ability attract offers take actions would benefit holders preferred stock complaint also alleges federal securities laws violations breach fiduciary duty suit requests injunction proceeding pending offer made sunter acquisition acquire allegheny sunter acquisition corp sunter holdings corp formed first boston allegheny allegheny said sunter concerns intend vigorously defend complaint charges complaints filed robert parties believed shares allegheny preferred stock reuter diff --git a/tests/data_for_tests/io/R52/train.csv b/tests/data_for_tests/io/R52/train.csv new file mode 100644 index 00000000..34af13dc --- /dev/null +++ b/tests/data_for_tests/io/R52/train.csv @@ -0,0 +1,6 @@ +earn,convertible securities sets dividend convertible securities fund inc said board declared initial quarterly dividend three cents per share payable april shareholders record april said anticipates paying regular quarterly dividend company made initial public stock offering march five reuter +jobs,n z unemployment rate pct december quarter new zealand unemployment rate pct workforce quarter ended december unchanged revised pct preliminary pct previous quarter slightly pct year earlier quarter statistics department said department citing household labour force survey said statement number unemployed october december september quarter year earlier reuter +rubber,japan rubber stocks fall march japan rubber stocks fell tonnes march february march japan rubber trade association said stocks tonnes february year earlier comparisons march feb march crude rubber synthetic latex reuter +money-fx,south korean fixed month high bank korea said fixed dollar highest level since february set yesterday risen pct dollar far year rising pct reuter +copper,nippon mining lowers copper price nippon mining co ltd said lowered selling price electrolytic copper yen per tonne effective immediately reuter +ship,australian unions launch new south wales strikes australian trade unions said launched week long strikes industrial action new south wales nsw protest new laws would reduce injury compensation payments union sources said talks state government broke last night two sides scheduled meet later today attempt find compromise rail freight shipping cargo movements country state first affected union officials said almost every business sector hit unless quick settlement state government recently introduced new workers compensation act would cut cash benefits injured workers third act awaiting parliamentary ratification nsw state premier said workers compensation risen recent years proposed cuts would save hundreds mlns dollars year union officials said industrial action could spread states federal government also plans make sharp cuts workers compensation reuter diff --git a/tests/data_for_tests/io/R8/dev.csv b/tests/data_for_tests/io/R8/dev.csv new file mode 100644 index 00000000..b7271c38 --- /dev/null +++ b/tests/data_for_tests/io/R8/dev.csv @@ -0,0 +1,6 @@ +acq,amoskeag bank seek amoskeag bank shares inc portsmouth savings bank said file new hampshire supreme court march ruling state regulatory approval amoskeag acquisition portsmouth decision believe go well beyond affiliation amoskeag portsmouth savings bank said amoskeag chairman william transaction opposed group portsmouth investors wanted bank remain independent according press reports reuter +earn,declares stock dividend financial corp said declared stock dividend one class share two class shares held payable may shareholders record april reuter +acq,allegheny ag shareholders file suit allegheny international inc agreed merge jointly formed first boston inc affiliate deal worth mn dlrs said shareholders preferred stock filed class action complaint company complaint alleges among things company board agreed pay first boston illegal seven mln dlr fee received higher offer company prior buyout suit fee allegheny ability attract offers take actions would benefit holders preferred stock complaint also alleges federal securities laws violations fiduciary duty suit requests injunction proceeding pending offer made sunter acquisition acquire allegheny sunter acquisition corp sunter holdings corp formed first boston allegheny allegheny said sunter concerns intend vigorously defend complaint charges complaints filed robert parties believed shares allegheny preferred stock reuter +trade,canadians urge exemption u trade bill group canadian lawmakers ontario today asked u exempt canada mandatory trade retaliation provisions major trade bill considered u congress meeting northeast midwest coalition organization u legislators david cooke chairman ontario parliament select committee economic affairs said exemption would help trade relations trade legislation considered full house late april would require president reagan retaliate foreign unfair trade practices unless trade actions would harm u economy currently reagan reject trade sanctions grounds cooke member liberal party told u congressmen understand trade bill think concerns parts world would suggest best concerns canada consider country bill added canada united states largest trading partner two way trade billion dlrs according coalition u ran billion dlr deficit manufactured goods year compared billion dlr surplus services trade reuter +earn,american corp nd qtr feb shr profit one cts vs loss three cts net profit vs loss revs mln vs mln six months shr profit six cts vs loss six cts net profit mln vs loss mln revs mln vs mln note six months includes gain four cts change accounting principle reuter +earn,meyers co increases dividend qtly div eight cts vs seven cts prior payable may record april reuter diff --git a/tests/data_for_tests/io/R8/test.csv b/tests/data_for_tests/io/R8/test.csv new file mode 100644 index 00000000..13225334 --- /dev/null +++ b/tests/data_for_tests/io/R8/test.csv @@ -0,0 +1,6 @@ +earn,technology inc nd qtr march shr profit eight cts vs loss dlrs net profit vs loss revs mln vs avg shrs vs six mths shr loss nine cts vs loss dlrs net loss vs loss revs mln vs mln avg shrs vs reuter +earn,nacco industries report nd qtr gain nacco industries inc said report gain second quarter mln dlrs dlrs share sale stock subsidiary nacco said north american coal corp unit received notice consolidation coal co unit du pont co dd exercise option buy stock mining co subsidiary north american coal stock north american coal receive mln dlrs mln paid closing april rest company said addition pay dividend north american coal mln dlrs retained earnings closing funds previously used finance mining operations consolidation coal got option group utilities received option nacco nacco reported earnings mln dlrs dlrs share last year second quarter generated mln dlrs net income equal cts share nacco total earnings dlrs share produced mln short tons mln tons produced north american coal nacco said reuter +earn,buffton post investigation charge buffton corp said conduct investigation plant designated site result charge six cts per share second quarter year ago second quarter buffton reported net income cts share dlrs sales mln dlrs study completed nine months determine action may required inc plant former owner split cost buffton said share cost dlrs reuter +acq,american dynamics sell pct stake american dynamics corp meridian reserve inc said signed definitive agreement meridian buy mln shares pct american dynamics common stock terms agreement santa calif based meridian said pay based american dynamics one mln dlrs cash notes five years shares common stock meridian said option issue additional shares common next two years payment certain notes meridian oil gas company whose operations primarily oklahoma said acquisition increase consolidated assets mln dlrs committed gas reserves mln dlrs discounted present value american dynamics engaged gas gathering transmission liquids also oklahoma companies said five plants miles transmission lines five oklahoma counties reuter +money-fx,ussr exchange rates soviet state bank effective april roubles per hundred unless stated u stg unch fin unch yen aus aus dlr unch pak unch ind unch unch one unch unch +earn,republicbank rpt brazil loans republicbank corp said placed mln dlrs intermediate term loans brazil non accrual basis march said reclassification reduce first quarter earnings mln dlrs taxes mln dlrs taxes brazil change position moratorium interest payments republicbank also said net income first quarter expected mln dlrs cts share fully diluted basis year ago first quarter company earned mln dlrs cts share company also said first quarter results expected include provision loan losses mln dlrs mln dlrs net loan charge offs mln dlrs said provision increase loan losses mln dlrs pct loans republicbank total assets billion dlrs announced december agreement interfirst corp form first republicbank corp merger approved regulatory agencies stockholders would create th largest bank holding company united states reuter diff --git a/tests/data_for_tests/io/R8/train.csv b/tests/data_for_tests/io/R8/train.csv new file mode 100644 index 00000000..77897bb9 --- /dev/null +++ b/tests/data_for_tests/io/R8/train.csv @@ -0,0 +1,6 @@ +earn,meyers co year feb shr dlrs vs dlrs net mln dlrs vs mln revs mln vs mln note results reflect year month period company changed fiscal year end february march reuter +earn,kelly oil gas partners year dec shr cts vs cts net mln vs mln revs mln vs mln reuter +money-fx,japan seeks strengthen paris currency accord japan seek strengthen paris accord currency stability meeting group seven leading industrial nations tomorrow japanese officials said however officials japanese finance minister kiichi miyazawa asked identified would provide details wanted accord signed six leading industrial democracies february strengthened currency target zones reference ranges discussed g meeting scheduled tomorrow japanese officials said meeting held conjunction week international monetary fund world bank sessions currency pact need changing language used paris accord officials said miyazawa met u treasury secretary james baker early afternoon discussed dollar yen exchange rates officials said declined disclosed details discussion japanese officials also declined detail miyazawa baker discussed subject greater joint intervention currency markets stabilize dollar independent american intervention officials said money market action stabilize dollar benefit japan suffering sharp appreciation currency also benefit united states well u japan take steps boost domestic demand reduce trade surplus japan explain economic measures g officials said however miyazawa failed outline size japanese economic package meeting baker today japanese budget authorized parliament despite new fiscal year started april one officials said japan ruling liberal democratic party revealed economic package today calling billion yen additional spending reuter +earn,convertible securities sets dividend convertible securities fund inc said board declared initial quarterly dividend three cents per share payable april shareholders record april said anticipates paying regular quarterly dividend company made initial public stock offering march five reuter +money-fx,south korean fixed month high bank korea said fixed dollar highest level since february set yesterday risen pct dollar far year rising pct reuter +ship,australian unions launch new south wales strikes australian trade unions said launched week long strikes industrial action new south wales nsw protest new laws would reduce injury compensation payments union sources said talks state government broke last night two sides scheduled meet later today attempt find compromise rail freight shipping cargo movements country state first affected union officials said almost every business sector hit unless quick settlement state government recently introduced new workers compensation act would cut cash benefits injured workers third act awaiting parliamentary nsw state premier said workers compensation risen recent years proposed cuts would save hundreds dollars year union officials said industrial action could spread states federal government also plans make sharp cuts workers compensation reuter diff --git a/tests/data_for_tests/io/mr/dev.csv b/tests/data_for_tests/io/mr/dev.csv new file mode 100644 index 00000000..a00e0b77 --- /dev/null +++ b/tests/data_for_tests/io/mr/dev.csv @@ -0,0 +1,6 @@ +1,"apesar de seus graves problemas , o filme consegue entreter" +0,"except as an acting exercise or an exceptionally dark joke , you wonder what anyone saw in this film that allowed it to get made" +0,"a real clunker a well made , thoughtful , well acted clunker , but a clunker nonetheless" +0,an ugly duckling tale so hideously and clumsily told it feels accidental +0,"unspeakable , of course , barely begins to describe the plot and its complications vulgar is too optimistic a title" +0,at least moore is a real charmer \ No newline at end of file diff --git a/tests/data_for_tests/io/mr/test.csv b/tests/data_for_tests/io/mr/test.csv new file mode 100644 index 00000000..f3804141 --- /dev/null +++ b/tests/data_for_tests/io/mr/test.csv @@ -0,0 +1,6 @@ +1,the animated sequences are well done and perfectly constructed to convey a sense of childhood imagination and creating adventure out of angst +1,a great companion piece to other napoleon films +1,spellbinding fun and deliciously exploitative +0,an ugly duckling tale so hideously and clumsily told it feels accidental +0,"unspeakable , of course , barely begins to describe the plot and its complications vulgar is too optimistic a title" +0,at least moore is a real charmer \ No newline at end of file diff --git a/tests/data_for_tests/io/mr/train.csv b/tests/data_for_tests/io/mr/train.csv new file mode 100644 index 00000000..82c01beb --- /dev/null +++ b/tests/data_for_tests/io/mr/train.csv @@ -0,0 +1,6 @@ +1,"'moore is like a progressive bull in a china shop , a provocateur crashing into ideas and special interest groups as he slaps together his own brand of liberalism '" +1,idiotic and ugly +1,"even if the naipaul original remains the real masterpiece , the movie possesses its own languorous charm" +1,"the movie is amateurish , but it 's a minor treat" +1,"some people march to the beat of a different drum , and if you ever wondered what kind of houses those people live in , this documentary takes a look at 5 alternative housing options" +1,the movie plays up the cartoon 's more obvious strength of snazziness while neglecting its less conspicuous writing strength diff --git a/tests/data_for_tests/io/ohsumed/dev.csv b/tests/data_for_tests/io/ohsumed/dev.csv new file mode 100644 index 00000000..7a26fb04 --- /dev/null +++ b/tests/data_for_tests/io/ohsumed/dev.csv @@ -0,0 +1,6 @@ +C23,"assessment biliary tract liver transplantation tube cholangiography iodida scanning biliary tract obstruction anastomotic leakage common problems following liver transplantation sequential study , 31 patients liver transplant investigated 99mtc iodida \( iodida \) scanning tube cholangiography \( ttc \) results compared clinical outcome seven patients extrahepatic biliary obstruction one patient biliary leak detection biliary complications ttc iodida scanning similar terms sensitivity \( 63 per cent \) ttc better specificity \( 79 per cent versus 60 per cent \) accuracy \( 74 per cent versus 60 per cent \) iodida scanning liver function taken account , diagnostic efficacy tests patients bilirubin levels less 200 mumol l similar levels greater 200 mumol l greater number false positive results iodida scanning \( 12 per cent versus 54 per cent \) significant biliary leak clearly detected ttc iodida scanning ttc remains effective way evaluating biliary tract transplantation iodida scanning limited value bilirubin levels elevated , may provide additional information blood supply , hepatocyte function intrahepatic cholestasis" +C23,"patterns dyspepsia patients clinical evidence organic diseases studied 2000 dyspeptic patients obvious signs organic disease first examination , order \( 1 \) verify many diagnoses idiopathic dyspepsia really made diagnostic procedures \( 2 \) evaluate diagnostic power symptoms distinguishing organic idiopathic dyspepsia latter considered structural abnormalities found cases , distinction made related associated organic dyspepsia according whether certain relationship abnormalities dyspeptic symptoms patients referred us follows \( 1 \) spontaneously , \( 2 \) sent physicians us , \( 3 \) referred open access endoscopic service results show frequency idiopathic dyspepsia 26 , whereas associated structural abnormalities present 45 4 obvious organic causes dyspepsia seen 28 6 \( 24 benign 4 6 malignant diseases \) considered separately , symptom alone allows correct diagnosis simultaneous evaluation symptoms linear discriminant analysis distinguishes idiopathic organic dyspeptic patients 70 cases higher discrimination percentage 70 cases higher discrimination percentage could probably obtained using wider range clinical parameters complex statistical analysis interrelationships exist clinical symptoms final diagnosis" +C23,"evaluation 13c urea breath test detection helicobacter pylori monitoring effect non ulcer dyspepsia sixty nine patients non ulcer dyspepsia studied endoscopy , biopsy , quick urease \( \) test , helicobacter pylori culture , 13c urea breath test treatment \( \) two tablets twice daily four weeks symptoms non ulcer dyspepsia recorded using standard questionnaire using h pylori culture gold standard , sensitivity 13c urea breath test 90 , specificity 98 6 , accuracy 94 8 positive predictive value 98 2 negative predictive value 92 5 conversion rate h pylori positive negative status treatment 17 9 symptoms non ulcer dyspepsia improved appreciably treatment irrespective h pylori status 13c urea breath test accurate research tool suitable serial testing population surveys" +C23,"demonstration area slow conduction human atrial flutter ten patients chronic atrial flutter studied prospectively using electrophysiologic mapping pacing techniques assess mechanism atrial flutter presence area slow conduction atria electrograms recorded greater equal 30 right atrial sites patient atrial flutter demonstrated right atrial free wall activation interatrial septum activation , consistent reentrant circuit involving right atrium six patients , slow conduction occurred atrial flutter inferior right atrium spatially associated fractionated recordings four patients , missing interval electrical activity occurred inferior right atrium average 40 atrial flutter cycle transient criteria demonstrated patient rapid high right atrial pacing mean activation time high right atrial pacing site coronary sinus \( inferior left atrial \) recording site long \( 228 ms \) consistent activation area slow conduction rapid pacing atrial flutter coronary sinus site , transient criteria could demonstrated mean activation time coronary sinus pacing site high right atrial recording site relatively short \( 134 ms \) consistent activation high right atrium area slow conduction high right atrial pacing sinus rhythm rates similar atrial flutter demonstrated short activation time coronary sinus low right atrial sites \( mean 169 88 ms , respectively \) , indicating activation area slow conduction coronary sinus pacing sinus rhythm demonstrated phenomena low right atrial electrograms recorded sinus rhythm rapid pacing sinus rhythm fractionated , although atrial flutter thus , atrial mapping pacing data complementary , indicating human atrial flutter patients studied generated reentrant circuit right atrium , area slow conduction low right atrium present atrial flutter" +C23,"analysis base station morphine orders assessment physician consistency paramedic contact base station consistent recommendations reflecting consensus base station physician care urban ems system , paramedics must contact single base station provide morphine sulfate \( ms \) patient chest pain performed retrospective cohort analysis prehospital ms requests chest pain determine consistency circumstances paramedic team refused ms ms requests represented 123 1 , \( 7 \) line physician consultations 6 month study 15 123 \( 12 \) ms requests refused neither mean patient age , sex distribution , presenting vital signs correlated ms refusal maximum estimate transport time hospital less equal 5 minutes noted 7 15 \( 47 \) medication compared 11 96 \( 11 \) documented estimated transport times \( p less equal 0 005 \) simultaneous request nitroglycerin \( \) noted 6 15 \( 40 \) medication 15 108 \( 14 \) \( p less 0 05 \) found refusal ms administration uncommon physicians tended ms transport time short requested concomitant administration also noted physician inconsistencies refusal findings guide physician consensus development avoid mixed paramedics" +C23,"predictors smoking nhanes followup experience published prospective studies predictors spontaneously cigarette smoking nationally representative u population paper describes study , using cohort taken first national health nutrition examination survey \( nhanes , 1971 1975 \) traced nhanes epidemiologic followup survey \( 1982 1984 \) successful \( least 1 year time followup \) ascertained among adults \( age 25 74 years \) smokers time nhanes disabled followup independent predictors \( proportional hazards multiple regression \) \( 1 \) older age \( 2 \) white race \( 3 \) fewer cigarettes smoked day \( 4 \) higher household income \( 5 \) hospitalization followup period predictors relapse \( ex smokers nhanes smoking time followup \) \( 1 \) younger age \( 2 \) urban residence \( 3 \) female gender findings implications intervention strategies , public health projections research" diff --git a/tests/data_for_tests/io/ohsumed/test.csv b/tests/data_for_tests/io/ohsumed/test.csv new file mode 100644 index 00000000..553af66a --- /dev/null +++ b/tests/data_for_tests/io/ohsumed/test.csv @@ -0,0 +1,6 @@ +C23,"development small caliber biologic vascular graft evaluation antithrombogenicity early healing process authors previously showed small caliber xenograft using crosslinking technique applicable aortocoronary bypass grafting study graft , antithrombogenicity healing process evaluated early stage implantation fresh sheep carotid artery \( id \) obtained cross linked compounds , used small caliber vascular graft graft white soft six cm segments graft implanted carotid arteries bilaterally nine dogs sodium heparin given surgery , anticoagulant used postoperatively fifteen grafts eight dogs removed 1 hr 30 days implantation , 13 15 grafts found patent two grafts , one 3 days , 14 days , occluded anastomotic area occluded grafts felt hard outside one dog , grafts shown angiographically patent 14 days implantation , dog kept long term observation macroscopically , thrombus observed patent grafts microscopically , inner surface near anastomotic lines covered endothelial cells , infiltration fibroblasts observed outside 7 days implantation foreign body reactions seen around graft 30 days implantation , thin layer plasma protein middle graft observed scanning electron microscopy \( sem \) observations , concluded grafts exhibited satisfactory early antithrombogenicity healing implantation" +C23,"proliferation substrate effects endothelial cell thrombogenicity effects cellular differentiation status adhesive substrate endothelial cell function cell culture measured enzyme based assay surface thrombogenicity solid plastic , microporous polymeric , fibronectin \( fn \) treated microporous polymeric used substrates growth endothelial cells microporous fn treated synthetic substrates shown aid induction cellular differentiation mechanisms cells studied proliferative growth conditions thrombogenicity surface created endothelial cell monolayers various experimental conditions determined using enzyme based assay fibrin deposition actively proliferating cells solid plastic substrate produced thrombogenic surface , confluent endothelial cell monolayers grown fn treated microporous substrate least thrombogenic surfaces data suggest endothelial cell surface thrombogenicity substrate control , also related cellular differentiation status findings used design novel approach small diameter synthetic vascular graft problem" +C23,"effect complement arachidonic acid pathway inhibition white blood cell count deposition vascular grafts determine role complement arachidonic acid metabolites decrease peripheral white blood cell count \( pwbc \) observed graft implantation , dacron aortic grafts implanted control rabbits \( group , n 13 \) , rabbits pretreated venom factor \( 80 u kg \) complement \( group ii , n 13 \) , indomethacin \( 2 5 mg kg \) inhibit cyclooxygenase \( group iii , n 7 \) , diethylcarbamazine \( dec , 90 mg kg \) inhibit leukotriene synthesis \( group iv , n 7 \) pwbc measured 15 min 1 hr graft implantation graft removal , wbc count grafts \( gwbc \) determined light microscopy \( \) scanning electron microscopy \( sem \) one hr graft implantation , pwbc decreased significantly groups iv 46 , 52 , 40 , 45 preoperative pwbc , respectively significant difference among groups revealed gwbc per field 8 0 , 12 3 , 5 8 , 6 8 groups iv , respectively similarly , sem showed gwbc per field 2 5 , 5 6 , 0 7 , 1 5 groups iv , respectively sem gwbc significantly greater group ii \( p less 0 01 \) , significantly less group iii \( p less 0 05 \) results suggested complement arachidonic acid pathways alone affect fall pwbc , may influence gwbc" +C23,"total perinatal wastage clarification priorities pregnancy outcome 16 , women carrying 17 , living fetuses 16 weeks gestation studied well recording perinatal deaths , losses 28 weeks one year delivery recorded give total perinatal wastage rate 21 6 per 1000 fetuses alive 16 weeks compared perinatal mortality rate \( plus early neonatal deaths \) 7 8 per 1000 births deaths classified according pathological sub groups concept perinatal care using perinatal mortality compared using total perinatal wastage" +C23,"magnetic resonance imaging idiopathic retroperitoneal fibrosis measurement t1 relaxation time magnetic resonance imaging 0 08 performed nine patients proven idiopathic retroperitoneal fibrosis total 11 scans performed three patients scanned diagnosis one also two follow scans six patients scanned variable time diagnosis treatment scan , soft tissue mass readily identified , distribution corresponding seen computed tomography difference mean t1 relaxation time mass patients scanned diagnosis scanned treatment however , patient followed serial scans showed progressive reduction t1 value mass time comparison results obtained patients lymphoma suggests t1 values retroperitoneal fibrosis lower lymphoma , particularly non hodgkin 's lymphoma" +C23,"development reversibility lymphocyte dysfunction experimental obstructive jaundice study evaluates effect experimental biliary obstruction bile duct ligation \( \) biliary drainage cell mediated immunity wistar rats immune status assessed mitogen stimulation test lymphocytes animals followed 35 days regression analysis showed significant negative correlation lymphocyte function period jaundice \( correlation coefficient 0 57 , p less 0 001 \) following 21 days , groups animals internal biliary drainage 7 , 14 28 days , external drainage 14 days compared obstructed animals , 14 days internal drainage required improve lymphocyte function \( p less 0 05 \) animals 14 days external drainage significantly lower lymphocyte stimulation internal drainage animals \( p less 0 05 \) results demonstrate obstructive jaundice produces progressive reduction lymphocyte function reversed biliary drainage , internal drainage effective external drainage" diff --git a/tests/data_for_tests/io/ohsumed/train.csv b/tests/data_for_tests/io/ohsumed/train.csv new file mode 100644 index 00000000..7a6cfba7 --- /dev/null +++ b/tests/data_for_tests/io/ohsumed/train.csv @@ -0,0 +1,6 @@ +C23,"role membrane proteins monosodium urate crystal membrane interactions ii effect erythrocyte membranes membrane permeable impermeable protein crosslinking agents intact , human erythrocytes pretreated membrane permeable , dimethyl \( \) dimethyl \( \) membrane impermeable 3 , 3' \( \) \( \) protein crosslinking agents incubated monosodium urate monohydrate \( \) crystals percent inhibition lysis values pretreated cells relative untreated cells determined 3 agents caused concentration dependent inhibition induced hemolysis due decrease binding pretreated membranes proposed inhibition lysis due crosslinking integral cytoskeletal membrane proteins , resulting reduced mobility proteins , inhibition integral proteins aggregates decreased pore formation membrane" +C23,"biliary gut function following shock aim study characterize alterations gallbladder intestinal function hemorrhagic shock blood reperfusion animals subjected shock 30 mm hg arterial blood pressure 60 minutes resuscitated blood reinfusion gallbladder epithelial ion transport , gallbladder motility vitro vivo , gastrointestinal motility , flora stomach small bowel studied 2 24 hours shock changes 2 hours included decreased gallbladder contractility vitro decreased emptying vivo , loss coordination intestinal motor activity , decrease frequency intestinal electrical slow waves , reduced duration intestinal migrating motor complex cycle 24 hours , gallbladder epithelial permeability increased vitro contractility remained reduced vivo functions showed partial recovery gastrointestinal flora affected changes data demonstrate hemorrhagic shock reperfusion affect digestive motility early timing alterations observed partial recovery 24 hours post shock suggest ischemia hypoxia mechanism injury" +C23,"short term long term changes renal function donor nephrectomy retrospectively examined effect nephrectomy renal function 55 living related donors renal function measured scans patients studied preoperatively , 1 week 1 year postoperatively 20 patients 10 year followup available compensatory hypertrophy complete 1 week postoperatively effective renal plasma flow remaining kidney 32 5 higher preoperatively increase remained stable least year degree compensatory hypertrophy significantly greater male patients \( 46 9 1 week \) female patients \( 26 7 \) compensatory hypertrophy occurred age groups studied pronounced patients less 30 years old patients followed 10 years effective renal plasma flow decreased 387 7 ml per minute 1 week nephrectomy 4 ml per minute 10 years result similar decrease seen normal population according results , renal donation living related persons lead long term decrease renal function" +C23,treatment idiopathic retroperitoneal fibrosis immunosuppression idiopathic retroperitoneal fibrosis exceedingly uncommon childhood etiology uncertain support immunological basis disease given report 14 year old girl severe retroperitoneal fibrosis causing progressive azathioprine prednisolone used successfully case supports efficacy immunotherapy treatment idiopathic retroperitoneal fibrosis +C23,en bloc transplantation kidneys donors weighing less 15 kg adult recipients en bloc transplantation kidneys donors weighed less 15 kg 20 adult patients described medial kidney allowed adequate renal positioning growth graft venous thrombosis occurred 1 patient irreversible graft rejection occurred 4 patients graft survival 65 excellent function mean followup 8 8 months en bloc transplantation pediatric cadaver kidney grafts adults acceptable procedure +C23,"afferent nipple valve malfunction caused anchoring collar unexpected late complication kock continent ileal reservoir construction kock continent ileal reservoir urinary diversion , significantly high rates late postoperative complications regarding nipple valves , efferent limb particular , reported reports afferent nipple valve malfunction total 42 patients underwent kock pouch operation observed 12 months \( mean 38 months \) evaluated terms afferent nipple valve malfunction late afferent nipple valve complications observed 10 42 patients \( 24 \) complications included erosion fiber fabric used collar \( 5 patients \) , stenosis afferent limb \( 2 \) obstruction afferent nipple mucous plug fungus ball \( 3 \) latter 2 complications due mechanical dynamic obstruction urine flow caused nonabsorbable collar none 10 patients problems efferent nipple valve function results suggest peristaltic direction intestine use nonabsorbable material collar primarily responsible late afferent nipple valve complications modifications needed produce stable nipple valve otherwise , simpler reliable alternative techniques anastomosis considered" diff --git a/tests/io/loader/test_classification_loader.py b/tests/io/loader/test_classification_loader.py index 836e24e4..02c58e80 100644 --- a/tests/io/loader/test_classification_loader.py +++ b/tests/io/loader/test_classification_loader.py @@ -5,7 +5,8 @@ import os from fastNLP.io import DataBundle from fastNLP.io.loader.classification import YelpFullLoader, YelpPolarityLoader, IMDBLoader, \ - SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader + SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, \ + MRLoader, R8Loader, R52Loader, OhsumedLoader, NG20Loader @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") @@ -31,6 +32,11 @@ class TestLoad(unittest.TestCase): 'ChnSentiCorp': ('tests/data_for_tests/io/ChnSentiCorp', ChnSentiCorpLoader, (6, 6, 6), False), 'THUCNews': ('tests/data_for_tests/io/THUCNews', THUCNewsLoader, (9, 9, 9), False), 'WeiboSenti100k': ('tests/data_for_tests/io/WeiboSenti100k', WeiboSenti100kLoader, (6, 7, 6), False), + 'mr': ('tests/data_for_tests/io/mr', MRLoader, (6, 6, 6), False), + 'R8': ('tests/data_for_tests/io/R8', R8Loader, (6, 6, 6), False), + 'R52': ('tests/data_for_tests/io/R52', R52Loader, (6, 6, 6), False), + 'ohsumed': ('tests/data_for_tests/io/R52', OhsumedLoader, (6, 6, 6), False), + '20ng': ('tests/data_for_tests/io/R52', NG20Loader, (6, 6, 6), False), } for k, v in data_set_dict.items(): path, loader, data_set, warns = v