| @@ -0,0 +1,108 @@ | |||||
| #!/usr/bin/env python | |||||
| # | |||||
| # A format checker for LIBSVM | |||||
| # | |||||
| # | |||||
| # Copyright (c) 2007, Rong-En Fan | |||||
| # | |||||
| # All rights reserved. | |||||
| # | |||||
| # This program is distributed under the same license of the LIBSVM package. | |||||
| # | |||||
| from sys import argv, exit | |||||
| import os.path | |||||
def err(line_no, msg):
    """Print a diagnostic for one dataset line to standard output.

    line_no -- number of the line the problem was found on
    msg     -- description of the problem
    """
    report = "line {0}: {1}".format(line_no, msg)
    print(report)
def my_float(x):
    """Convert the string *x* to a float, accepting only plain finite numbers.

    Works like float() but does not accept "nan" or "inf"/"infinity" (in any
    letter case). Digit-group underscores such as "1_000" are rejected too:
    float() accepts them on Python 3.6+ (PEP 515) but not on Python 2, so
    refusing them keeps the check identical across interpreters.
    NOTE(review): a C strtod()-based reader would also stop at '_' -- confirm
    against the LIBSVM data reader.

    Raises ValueError if *x* is not an acceptable number.
    """
    lowered = x.lower()
    if "nan" in lowered or "inf" in lowered or "_" in x:
        raise ValueError("not a plain finite number: {0!r}".format(x))
    return float(x)
def main():
    """Check that the file named on the command line is in LIBSVM format.

    Expects exactly one command-line argument, the dataset path. Each line
    must end with a newline character and consist of a label (one number, or
    a comma-separated list of numbers) followed by whitespace-separated
    <index>:<value> features. Indices must be integers that are not negative
    and strictly ascending within the line; values must be real numbers.
    Every problem found is reported through err().

    Returns 0 if no line has an error and 1 otherwise. Calls exit(1) when
    the argument count is wrong or the dataset path does not exist.
    """
    if len(argv) != 2:
        print("Usage: {0} dataset".format(argv[0]))
        exit(1)

    dataset = argv[1]
    if not os.path.exists(dataset):
        print("dataset {0} not found".format(dataset))
        exit(1)

    error_line_count = 0

    # The context manager guarantees the file is closed, even if checking
    # is interrupted part-way through.
    with open(dataset, 'r') as data_file:
        for line_no, line in enumerate(data_file, 1):
            line_error = False

            # each line must end with a newline character
            if not line.endswith('\n'):
                err(line_no, "missing a newline character in the end")
                line_error = True

            nodes = line.split()

            # check label
            if not nodes:
                err(line_no, "missing label, perhaps an empty line?")
                line_error = True
            else:
                label = nodes.pop(0)
                if ',' in label:
                    # multi-label format
                    try:
                        for part in label.split(','):
                            my_float(part)
                    except ValueError:
                        err(line_no, "label {0} is not a valid multi-label form".format(label))
                        line_error = True
                else:
                    try:
                        my_float(label)
                    except ValueError:
                        err(line_no, "label {0} is not a number".format(label))
                        line_error = True

            # check features
            prev_index = -1
            # Last feature that parsed successfully; it is the one that set
            # prev_index, so it is the right one to show in ordering errors.
            prev_node = None
            for node in nodes:
                try:
                    (index, value) = node.split(':')
                    # int() accepts digit-group underscores ("1_0") on
                    # Python 3.6+; they are not a plain integer index.
                    if '_' in index:
                        raise ValueError(index)
                    index = int(index)
                    my_float(value)
                except ValueError:
                    err(line_no, "feature '{0}' not an <index>:<value> pair, <index> integer, <value> real number ".format(node))
                    line_error = True
                    continue

                # precomputed kernel's index starts from 0 and LIBSVM
                # checks it. Hence, don't treat index 0 as an error.
                if index < 0:
                    err(line_no, "feature index must be positive; wrong feature {0}".format(node))
                    line_error = True
                elif index <= prev_index:
                    err(line_no, "feature indices must be in an ascending order, previous/current features {0} {1}".format(prev_node, node))
                    line_error = True
                prev_index = index
                prev_node = node

            if line_error:
                error_line_count += 1

    if error_line_count > 0:
        print("Found {0} lines with error.".format(error_line_count))
        return 1

    print("No error.")
    return 0
# Run the checker only when this file is executed as a script; the value
# returned by main() is passed to exit() as the process status.
if __name__ == "__main__":
    status = main()
    exit(status)