リビジョン | 156a6dcf855a5b4bc4e16d406bf02debcc02cc45 (tree) |
---|---|
日時 | 2020-01-27 04:41:23 |
作者 | Dreas Nielsen <dreas.nielsen@gmai...> |
コミッター | Dreas Nielsen |
Added option to check the order of columns in the CSV file.
@@ -39,12 +39,14 @@ | ||
39 | 39 | # 2011-09-25 First version. Version 0.8.0.0. RDN. |
40 | 40 | # 2018-10-27 Converted to run under both Python 2 and 3. Version 1.0.0. RDN. |
41 | 41 | # 2019-01-02 Corrected handling of next() for csv library. Version 1.0.1. RDN. |
42 | -# 2018-01-04 Added check for data rows with more columns than column headers. | |
42 | +# 2019-01-04 Added check for data rows with more columns than column headers. | |
43 | 43 | # Version 1.1.0. RDN. |
44 | +# 2020-01-26 Added an option to check that the order of columns in the CSV | |
45 | +# file is the same as in the specifications. Version 1.2.0. RDN. | |
44 | 46 | # ============================================================================ |
45 | 47 | |
46 | -_version = "1.1.0" | |
47 | -_vdate = "2019-01-04" | |
48 | +_version = "1.2.0" | |
49 | +_vdate = "2020-01-26" | |
48 | 50 | |
49 | 51 | import sys |
50 | 52 | from optparse import OptionParser |
@@ -271,11 +273,12 @@ | ||
271 | 273 | def dispatch(self, check_funcs, data): |
272 | 274 | errlist = [ f(data) for f in check_funcs ] |
273 | 275 | return [ e for e in errlist if e ] |
274 | - def __init__(self, fmt_spec, colname, column_required_default, data_required_default): | |
276 | + def __init__(self, fmt_spec, colname, column_required_default, data_required_default, column_position): | |
275 | 277 | self.name = colname |
276 | 278 | self.data_required = data_required_default |
277 | 279 | # By default, all columns are required unless there is a specification indicating that it is not. |
278 | 280 | self.column_required = column_required_default |
281 | + self.column_position = column_position | |
279 | 282 | specs = fmt_spec.options(colname) |
280 | 283 | # Get the value for each option, using an appropriate function for each expected value type. |
281 | 284 | for spec in specs: |
@@ -350,6 +353,8 @@ | ||
350 | 353 | parser.add_option("-l", "--linelength", action="store_false", dest="linelength", |
351 | 354 | default=True, |
352 | 355 | help="Allow rows of the CSV file to have fewer columns than in the column headers. The default is to report an error for short data rows. If short data rows are allowed, any row without enough columns to match the format specification will still be reported as an error.") |
356 | + parser.add_option("-p", "--position", action="store_true", dest="position", default=False, | |
357 | + help="Position (order) of columns in the CSV file must match that in the specification.") | |
353 | 358 | parser.add_option("-i", "--case-insensitive", action="store_true", dest="caseinsensitive", |
354 | 359 | default=False, |
355 | 360 | help="Case-insensitive matching of column names in the format configuration file and the CSV file. The default is case-sensitive (i.e., column names must match exactly).") |
@@ -420,6 +425,7 @@ | ||
420 | 425 | :param column_required: Whether or not the column must be in the CSV file to be checked. |
421 | 426 | :param data_required: Whether or not a data value is required on every row of the CSV file. |
422 | 427 | :param chkopts: The name of a section in the format specification file containing additional options. |
428 | + :returns: A dictionary of CsvChecker objects, indexed by column name. | |
423 | 429 | """ |
424 | 430 | fmtspecs = ConfigParser() |
425 | 431 | try: |
@@ -431,13 +437,13 @@ | ||
431 | 437 | # Convert ConfigParser object into a list of CsvChecker objects |
432 | 438 | speccols = [ sect for sect in fmtspecs.sections() if sect != chkopts ] |
433 | 439 | cols = {} |
434 | - for col in speccols: | |
435 | - cols[col] = CsvChecker(fmtspecs, col, column_required, data_required) | |
440 | + for i, col in enumerate(speccols): | |
441 | + cols[col] = CsvChecker(fmtspecs, col, column_required, data_required, i) | |
436 | 442 | return cols |
437 | 443 | |
438 | 444 | |
439 | 445 | def check_csv_file(csv_fname, cols, halt_on_err, columnexit, \ |
440 | - linelength, caseinsensitive, encoding=None): | |
446 | + linelength, caseinsensitive, encoding=None, match_position=False): | |
441 | 447 | """Check that all of the required columns and data are present in the CSV file, and that |
442 | 448 | the data conform to the appropriate type and other specifications. |
443 | 449 |
@@ -448,6 +454,8 @@ | ||
448 | 454 | :param linelength: Whether to report an error if any data row has a different number of items than indicated by the column headers. |
449 | 455 | :param caseinsensitive: Whether column names in the specifications and CSV file should be compared case-insensitively. |
450 | 456 | :param encoding: The character encoding of the CSV file. |
457 | + :param match_position: Whether or not the position (order) of columns in the CSV file must match that in the specifications. | |
458 | + :rtype: A list of error messages as strings. | |
451 | 459 | """ |
452 | 460 | errorlist = [] |
453 | 461 | dialect = csv.Sniffer().sniff(open(csv_fname, "rt").readline()) |
@@ -467,8 +475,8 @@ | ||
467 | 475 | if len(req_missing) > 0: |
468 | 476 | errorlist.append(("The following columns are required, but are not present in the CSV file: %s." % ", ".join(req_missing), csv_fname, 1)) |
469 | 477 | return errorlist |
470 | - # Exit if there are extra columns and the option to exit is set. | |
471 | - if columnexit: | |
478 | + # Exit if there are extra columns and either the option to exit is set or the column positions must match. | |
479 | + if columnexit or match_position: | |
472 | 480 | if caseinsensitive: |
473 | 481 | speccols_l = [ c.lower() for c in cols ] |
474 | 482 | extra = [ col for col in colnames if not (col.lower() in speccols_l) ] |
@@ -477,6 +485,24 @@ | ||
477 | 485 | if len(extra) > 0: |
478 | 486 | errorlist.append(("The following columns have no format specifications but are in the CSV file: %s." % u", ".join(extra), csv_fname, 1)) |
479 | 487 | return errorlist |
488 | + # Report an error if the position (order) of columns is required to be the same and it is not. | |
489 | + if match_position: | |
490 | + spec_col_order = [c[0] for c in sorted([(c[1].name, c[1].column_position) for c in cols.items()], key=lambda p: p[1])] | |
491 | + same_order = True | |
492 | + if caseinsensitive: | |
493 | + colnames_l = [ c.lower() for c in colnames ] | |
494 | + specnames_l = [ c.lower() for c in spec_col_order ] | |
495 | + for i, cname in enumerate(colnames_l): | |
496 | + if specnames_l[i] != cname: | |
497 | + same_order = False | |
498 | + break | |
499 | + else: | |
500 | + for i, cname in enumerate(colnames): | |
501 | + if spec_col_order[i] != cname: | |
502 | + same_order = False | |
503 | + break | |
504 | + if not same_order: | |
505 | + errorlist.append(("The order of columns in the CSV file is not the same as in the specifications", csv_fname, 1)) | |
480 | 506 | # Column names common to specifications and data file. These will be used |
481 | 507 | # to index the cols dictionary to get the appropriate check method |
482 | 508 | # and to index the CSV column name list (colnames) to get the column position. |
@@ -549,7 +575,7 @@ | ||
549 | 575 | cols = read_format_specs(fmt_file, opts.column_required, opts.data_required, chkopts) |
550 | 576 | # Check the file |
551 | 577 | errorlist = check_csv_file(csv_file, cols, opts.haltonerror, |
552 | - opts.columnexit, opts.linelength, opts.caseinsensitive, opts.encoding) | |
578 | + opts.columnexit, opts.linelength, opts.caseinsensitive, opts.encoding, opts.position) | |
553 | 579 | if len(errorlist) > 0: |
554 | 580 | show_errors(errorlist) |
555 | 581 | return 1 |
@@ -48,7 +48,7 @@ | ||
48 | 48 | |
49 | 49 | # General information about the project. |
50 | 50 | project = u'chkcsv' |
51 | -copyright = u'2011-2019, Dreas Nielsen' | |
51 | +copyright = u'2011-2020, Dreas Nielsen' | |
52 | 52 | author = u'Dreas Nielsen' |
53 | 53 | |
54 | 54 | # The version info for the project you're documenting, acts as replacement for |
@@ -56,9 +56,9 @@ | ||
56 | 56 | # built documents. |
57 | 57 | # |
58 | 58 | # The short X.Y version. |
59 | -version = u'1.1' | |
59 | +version = u'1.2' | |
60 | 60 | # The full version, including alpha/beta/rc tags. |
61 | -release = u'1.1.0' | |
61 | +release = u'1.2.0' | |
62 | 62 | |
63 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation |
64 | 64 | # for a list of supported languages. |
@@ -6,7 +6,7 @@ | ||
6 | 6 | ``chkcsv.py`` is a Python module and program that checks the format |
7 | 7 | and content of a comma-separated-value (CSV) or similar delimited text |
8 | 8 | file. It can check whether required columns are present, and the type, |
9 | -length, and pattern of each column. | |
9 | +length, and pattern of each column, as well as the order of the columns. | |
10 | 10 | |
11 | 11 | |
12 | 12 | Syntax and Options |
@@ -66,6 +66,8 @@ | ||
66 | 66 | data rows are allowed, any row without enough |
67 | 67 | columns to match the format specification will |
68 | 68 | still be reported as an error. |
69 | + -p, --position Require that the position (order) of columns in the | |
70 | + CSV file match that in the specifications. | |
69 | 71 | -i, --case-insensitive |
70 | 72 | Case-insensitive matching of column names in |
71 | 73 | the format configuration file and the CSV file. |
@@ -332,7 +334,7 @@ | ||
332 | 334 | Copyright and License |
333 | 335 | ================================ |
334 | 336 | |
335 | -Copyright (c) 2011-2019, R.Dreas Nielsen | |
337 | +Copyright (c) 2011-2020, R.Dreas Nielsen | |
336 | 338 | |
337 | 339 | This program is free software: you can redistribute it and/or modify it |
338 | 340 | under the terms of the GNU General Public License as published by the |
@@ -1,7 +1,7 @@ | ||
1 | 1 | from distutils.core import setup |
2 | 2 | |
3 | 3 | setup(name='chkcsv', |
4 | - version='1.1.0', | |
4 | + version='1.2.0', | |
5 | 5 | description="Checks the format of a CSV file with respect to a specifed set of column names and types.", |
6 | 6 | author='Dreas Nielsen', |
7 | 7 | author_email='dreas.nielsen@gmail.com', |
@@ -26,8 +26,9 @@ | ||
26 | 26 | ], |
27 | 27 | long_description="""``chkcsv.py`` is a Python module and program |
28 | 28 | that checks the format of data in a CSV file. It can check whether required |
29 | -columns and data are present, and the type of data in each column. Pattern | |
30 | -matching using regular expressions is supported. | |
29 | +columns and data are present, check whether the type of data in each column | |
30 | +matches the specifications, and check whether columns are in a specified | |
31 | +order. Pattern matching using regular expressions is supported. | |
31 | 32 | |
32 | 33 | Complete documentation is at http://chkcsv.osdn.io/.""" |
33 | 34 | ) |