| @@ -0,0 +1,108 @@ | |||
| #!/usr/bin/env python | |||
| # | |||
| # A format checker for LIBSVM | |||
| # | |||
| # | |||
| # Copyright (c) 2007, Rong-En Fan | |||
| # | |||
| # All rights reserved. | |||
| # | |||
| # This program is distributed under the same license of the LIBSVM package. | |||
| # | |||
| from sys import argv, exit | |||
| import os.path | |||
def err(line_no, msg):
    """Report a problem found on one line of the dataset.

    Writes ``line <line_no>: <msg>`` to standard output.

    line_no -- number of the offending line, as it should appear in the report
    msg     -- human-readable description of the problem
    """
    report = "line {0}: {1}".format(line_no, msg)
    print(report)
def my_float(x):
    """Convert the string *x* to a float, refusing NaN and infinity.

    Behaves like the built-in ``float()`` except that any text containing
    "nan" or "inf" (in any letter case) is rejected before conversion.

    Raises ValueError for such text, and for anything ``float()`` itself
    cannot parse.
    """
    lowered = x.lower()
    # Substring tests also catch signed and long spellings such as
    # "-inf" or "Infinity".
    if "nan" in lowered or "inf" in lowered:
        raise ValueError
    return float(x)
def _label_has_error(line_no, nodes):
    """Remove the label token from the front of *nodes* and validate it.

    A label is either a single number or a comma-separated list of numbers
    (multi-label format). Any problem is reported through err().

    Returns True when the label is missing or malformed, False otherwise.
    """
    if not nodes:
        err(line_no, "missing label, perhaps an empty line?")
        return True

    label = nodes.pop(0)
    if ',' in label:
        # multi-label format: every comma-separated piece must be a number
        try:
            for piece in label.split(','):
                my_float(piece)
        except ValueError:
            err(line_no, "label {0} is not a valid multi-label form".format(label))
            return True
        return False

    try:
        my_float(label)
    except ValueError:
        err(line_no, "label {0} is not a number".format(label))
        return True
    return False


def _features_have_error(line_no, nodes):
    """Validate the <index>:<value> tokens of one line.

    Each token must split into exactly an integer index and a real value,
    indices must not be negative, and indices must be strictly ascending.
    Any problem is reported through err().

    Returns True when at least one token is in error, False otherwise.
    """
    found_error = False
    prev_index = -1
    for i, node in enumerate(nodes):
        try:
            # Unpacking fails with ValueError unless there is exactly one ':'.
            index, value = node.split(':')
            index = int(index)
            my_float(value)
        except ValueError:
            err(line_no, "feature '{0}' not an <index>:<value> pair, <index> integer, <value> real number ".format(node))
            found_error = True
            continue

        # precomputed kernel's index starts from 0 and LIBSVM
        # checks it. Hence, don't treat index 0 as an error.
        if index < 0:
            err(line_no, "feature index must be positive; wrong feature {0}".format(node))
            found_error = True
        elif index <= prev_index:
            err(line_no, "feature indices must be in an ascending order, previous/current features {0} {1}".format(nodes[i-1], node))
            found_error = True
        # Only successfully parsed tokens move the ordering baseline.
        prev_index = index
    return found_error


def main():
    """Check the dataset named on the command line for format errors.

    Expects exactly one command-line argument, the dataset path. Prints a
    usage or "not found" message and calls exit(1) when the argument is
    missing or the path does not exist.

    Every line is checked for a trailing newline, a valid label and valid
    features; each problem is printed through err(), and a summary is
    printed at the end.

    Returns 1 when at least one line has an error, 0 otherwise.
    """
    if len(argv) != 2:
        print("Usage: {0} dataset".format(argv[0]))
        exit(1)

    dataset = argv[1]
    if not os.path.exists(dataset):
        print("dataset {0} not found".format(dataset))
        exit(1)

    error_line_count = 0
    # The with-block guarantees the file is closed even if checking aborts.
    with open(dataset, 'r') as data_file:
        for line_no, line in enumerate(data_file, 1):
            line_error = False

            # each line must end with a newline character
            if not line.endswith('\n'):
                err(line_no, "missing a newline character in the end")
                line_error = True

            nodes = line.split()

            # The label check pops the first token, leaving only features.
            if _label_has_error(line_no, nodes):
                line_error = True
            if _features_have_error(line_no, nodes):
                line_error = True

            if line_error:
                error_line_count += 1

    if error_line_count > 0:
        print("Found {0} lines with error.".format(error_line_count))
        return 1
    print("No error.")
    return 0
# Run the checker only when executed as a script, and hand main()'s
# return value to exit() as the process exit status.
if __name__ == "__main__":
    status = main()
    exit(status)