From a209c8923332c5a5b28db305ed212ec1825dddbb Mon Sep 17 00:00:00 2001 From: Jonas 'Sortie' Termansen Date: Sun, 8 Apr 2018 01:43:27 +0200 Subject: [PATCH] Fix POSIX conformance issues in sort(1). Fix -C disabling checking rather than checking quietly. Fix sort(1) exiting 1 on certain errors, as POSIX requires sort(1) to only exit 1 if the input wasn't sorted when -c. Fix -o opening the output file for truncation before all the input has been read, as POSIX requires allowing -o to be an input file. POSIX requires sort(1) to handle input errors by either erroring with no output, or by erroring and sorting the input read so far. Change the current behavior of continuing to the next file to simply failing hard on the first input error. Don't increment the last line number on the end of the standard input. Report -c/-C as incompatible with -o. Exit unsuccessfully on any output errors. Update to current coding conventions and add documentation while here. --- utils/Makefile | 1 + utils/sort.1 | 154 +++++++++++++++++++++++++++++++++++ utils/sort.c | 215 +++++++++++-------------------------------------- 3 files changed, 204 insertions(+), 166 deletions(-) create mode 100644 utils/sort.1 diff --git a/utils/Makefile b/utils/Makefile index 6c550e62..a6add9c2 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -82,6 +82,7 @@ memstat.1 \ pager.1 \ passwd.1 \ readlink.1 \ +sort.1 \ uname.1 \ SBINS=\ diff --git a/utils/sort.1 b/utils/sort.1 new file mode 100644 index 00000000..752d6208 --- /dev/null +++ b/utils/sort.1 @@ -0,0 +1,154 @@ +.Dd April 8, 2018 +.Dt SORT 1 +.Os +.Sh NAME +.Nm sort +.Nd sort lines of text +.Sh SYNOPSIS +.Nm +.Op Fl CcmruVz +.Op Fl o Ar path +.Ar +.Sh DESCRIPTION +.Nm +reads lines of text from the standard input and writes the lines in sorted order +to the standard output. +If files are specified, the input is the concatenated content of the files read +in sequential order. +The +.Ar file +path can be set to +.Sq - +to specify the standard input.
+The lines are compared according to the current locale's collating rules. +.Pp +The options are as follows: +.Bl -tag -width "12345678" +.It Fl c, \-check, \-check=diagnose-first +Check whether the input is already sorted. +If a line is out of order (or an equal line is found if +.Fl u ) , +write an error describing which line was out of +order and exit 1. +.It Fl C, \-check=quiet, \-check=silent +Same as +.Fl c , +but write no error to the standard error about the input being out of order. +.It Fl m, \-merge +Merge the presorted input files into a sorted output. +.It Fl o Ar path , Fl \-output Ns = Ns Ar path +After reading the full input, write the output to the file at +.Pa path +(creating it if it does not already exist, discarding its previous contents if +it already existed). +The output file can be one of the input files. +This option is incompatible with +.Fl C +and +.Fl c . +.It Fl r , \-reverse +Compare the lines in reverse order. +.It Fl u , \-unique +Don't write a line if it is equal to the previous line. +.It Fl V , \-version-sort +Sort according to the version string, per +.Xr strverscmp 3 . +.It Fl z , \-zero-terminated +Lines are delimited with the NUL byte (0) instead of the newline byte (10). +.El +.Sh IMPLEMENTATION NOTES +In the event of an input error, +.Nm +will write an error to the standard error and exit unsuccessfully. +.Pp +.Nm +reads the whole input into memory, rather than storing intermediate sorting +steps in the filesystem, and requires enough memory to store a copy of the whole +input. +.Sh ENVIRONMENT +.Bl -tag -width "LC_COLLATE" +.It Dv LANG +The default locale for locale variables that are unset or null. +.It Dv LC_ALL +Overrides all the other locale variables if set. +.It Dv LC_COLLATE +Compare the input according to this locale's collating rules using +.Xr strcoll 3 . +.El +.Sh EXIT STATUS +.Nm +will exit 0 on success, exit 1 if the input was out of order when +.Fl C +or +.Fl c , +or exit 2 (or higher) otherwise.
+.Sh EXAMPLES +Read lines from the standard input and write them in sorted order to the +standard output: +.Bd -literal +sort < input > output +.Ed +.Pp +Read lines from the three specified files (where the second happens to be the +standard input) and write them in sorted order to output.txt: +.Bd -literal +grep pattern lines.txt | sort foo - bar -o output.txt +.Ed +.Pp +Sort the input file if it isn't already sorted: +.Bd -literal +if sort -C file; [ $? = 1 ]; then + sort file -o file +fi +.Ed +.Pp +Remove duplicate lines from the input by sorting it and removing lines equal to +the previous line: +.Bd -literal +sort -u +.Ed +.Sh SEE ALSO +.Xr cat 1 , +.Xr comm 1 , +.Xr join 1 , +.Xr uniq 1 , +.Xr qsort 3 , +.Xr strcoll 3 , +.Xr strverscmp 3 +.Sh STANDARDS +.Nm +is standardized in +.St -p1003.1-2008 , +which is currently partially implemented in this implementation of +.Nm . +.Pp +The +.Fl V +and +.Fl z +options, as well as the long options, are extensions also found in GNU +coreutils. +.Pp +As an extension, the +.Fl C +and +.Fl c +options support multiple input files. +.Sh BUGS +The +.St -p1003.1-2008 +options +.Fl b , +.Fl d , +.Fl f , +.Fl i , +.Fl k , +.Fl n , +and +.Fl t +are not currently implemented. +.Pp +The +.Fl m +option is not currently taken advantage of to speed up the sorting; rather, the +presorted input files are sorted all over again. diff --git a/utils/sort.c b/utils/sort.c index a5d4d41a..96b1c92e 100644 --- a/utils/sort.c +++ b/utils/sort.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, 2015 Jonas 'Sortie' Termansen. + * Copyright (c) 2014, 2015, 2018 Jonas 'Sortie' Termansen. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -17,8 +17,8 @@ * Sort, merge, or sequence check text files. */ +#include #include -#include #include #include #include @@ -29,55 +29,55 @@ // TODO: Implement all the features mandated by POSIX.
// TODO: Implement the useful GNU extensions. -int flip_comparison(int rel) +static int flip_comparison(int rel) { return rel < 0 ? 1 : 0 < rel ? -1 : 0; } -int indirect_compare(int (*compare)(const char*, const char*), - const void* a_ptr, const void* b_ptr) +static int indirect_compare(int (*compare)(const char*, const char*), + const void* a_ptr, const void* b_ptr) { const char* a = *(const char* const*) a_ptr; const char* b = *(const char* const*) b_ptr; return compare(a, b); } -int compare_line(const char* a, const char* b) +static int compare_line(const char* a, const char* b) { return strcoll(a, b); } -int indirect_compare_line(const void* a_ptr, const void* b_ptr) +static int indirect_compare_line(const void* a_ptr, const void* b_ptr) { return indirect_compare(compare_line, a_ptr, b_ptr); } -int compare_line_reverse(const char* a, const char* b) +static int compare_line_reverse(const char* a, const char* b) { return flip_comparison(compare_line(a, b)); } -int indirect_compare_line_reverse(const void* a_ptr, const void* b_ptr) +static int indirect_compare_line_reverse(const void* a_ptr, const void* b_ptr) { return indirect_compare(compare_line_reverse, a_ptr, b_ptr); } -int compare_version(const char* a, const char* b) +static int compare_version(const char* a, const char* b) { return strverscmp(a, b); } -int indirect_compare_version(const void* a_ptr, const void* b_ptr) +static int indirect_compare_version(const void* a_ptr, const void* b_ptr) { return indirect_compare(compare_version, a_ptr, b_ptr); } -int compare_version_reverse(const char* a, const char* b) +static int compare_version_reverse(const char* a, const char* b) { return flip_comparison(compare_version(a, b)); } -int indirect_compare_version_reverse(const void* a_ptr, const void* b_ptr) +static int indirect_compare_version_reverse(const void* a_ptr, const void* b_ptr) { return indirect_compare(compare_version_reverse, a_ptr, b_ptr); } @@ -90,10 +90,9 @@ struct input_stream FILE* current_file; 
const char* last_file_path; uintmax_t last_line_number; - bool result_status; }; -char* read_line(FILE* fp, const char* fpname, int delim) +static char* read_line(FILE* fp, const char* fpname, int delim) { char* line = NULL; size_t line_size = 0; @@ -102,7 +101,7 @@ char* read_line(FILE* fp, const char* fpname, int delim) { free(line); if ( ferror(fp) ) - error(0, errno, "read: `%s'", fpname); + err(2, "read: %s", fpname); return NULL; } if ( (unsigned char) line[amount-1] == (unsigned char) delim ) @@ -110,15 +109,14 @@ char* read_line(FILE* fp, const char* fpname, int delim) return line; } -char* read_input_stream_line(struct input_stream* is, int delim) +static char* read_input_stream_line(struct input_stream* is, int delim) { if ( !is->files_length ) { char* result = read_line(stdin, "", delim); - if ( ferror(stdin) ) - is->result_status = false; is->last_file_path = "-"; - is->last_line_number++; + if ( result ) + is->last_line_number++; return result; } while ( is->files_current < is->files_length ) @@ -130,21 +128,11 @@ char* read_input_stream_line(struct input_stream* is, int delim) if ( !strcmp(path, "-") ) is->current_file = stdin; else if ( !(is->current_file = fopen(path, "r")) ) - { - error(0, errno, "`%s'", path); - is->result_status = false; - is->files_current++; - continue; - } + err(2, "%s", path); } char* result = read_line(is->current_file, path, delim); if ( !result ) { - if ( ferror(is->current_file) ) - { - error(0, errno, "reading: `%s'", path); - is->result_status = false; - } if ( is->current_file != stdin ) fclose(is->current_file); is->current_file = NULL; @@ -158,9 +146,9 @@ char* read_input_stream_line(struct input_stream* is, int delim) return NULL; } -char** read_input_stream_lines(size_t* result_num_lines, - struct input_stream* is, - int delim) +static char** read_input_stream_lines(size_t* result_num_lines, + struct input_stream* is, + int delim) { char** lines = NULL; size_t lines_used = 0; @@ -171,18 +159,13 @@ char** 
read_input_stream_lines(size_t* result_num_lines, { if ( lines_used == lines_length ) { - size_t new_lines_length = lines_length ? 2 * lines_length : 64; - size_t new_lines_size = sizeof(char*) * new_lines_length; - char** new_lines = (char**) realloc(lines, new_lines_size); + size_t old_lines_length = lines_length ? lines_length : 64; + char** new_lines = (char**) reallocarray(lines, old_lines_length, + 2 * sizeof(char*)); if ( !new_lines ) - { - error(0, errno, "realloc"); - free(line); - is->result_status = false; - return *result_num_lines = lines_used, lines; - } + err(2, "malloc"); lines = new_lines; - lines_length = new_lines_length; + lines_length = 2 * old_lines_length; } lines[lines_used++] = line; } @@ -203,93 +186,6 @@ static void compact_arguments(int* argc, char*** argv) } } -static void help(FILE* fp, const char* argv0) -{ - fprintf(fp, "Usage: %s [OPTION]... [FILE]...\n", argv0); - fprintf(fp, "Write sorted concatenation of all FILE(s) to standard output.\n"); - fprintf(fp, "\n"); - fprintf(fp, "Mandatory arguments to long options are mandatory for short options too.\n"); - fprintf(fp, "Ordering options:\n"); - fprintf(fp, "\n"); -#if 0 - fprintf(fp, " -b, --ignore-leading-blanks ignore leading blanks\n"); - fprintf(fp, " -d, --dictionary-order consider only blanks and alphanumeric characters\n"); - fprintf(fp, " -f, --ignore-case fold lower case to upper case characters\n"); - fprintf(fp, " -g, --general-numeric-sort compare according to general numerical value\n"); - fprintf(fp, " -i, --ignore-nonprinting consider only printable characters\n"); - fprintf(fp, " -M, --month-sort compare (unknown) < `JAN' < ... 
< `DEC'\n"); - fprintf(fp, " -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n"); - fprintf(fp, " -n, --numeric-sort compare according to string numerical value\n"); - fprintf(fp, " -R, --random-sort sort by random hash of keys\n"); - fprintf(fp, " --random-source=FILE get random bytes from FILE\n"); -#endif - fprintf(fp, " -r, --reverse reverse the result of comparisons\n"); -#if 0 - fprintf(fp, " --sort=WORD sort according to WORD:\n"); - fprintf(fp, " general-numeric -g, human-numeric -h, month -M,\n"); - fprintf(fp, " numeric -n, random -R, version -V\n"); -#endif - fprintf(fp, " -V, --version-sort natural sort of (version) numbers within text\n"); - fprintf(fp, "\n"); - fprintf(fp, "Other options:\n"); - fprintf(fp, "\n"); -#if 0 - fprintf(fp, " --batch-size=NMERGE merge at most NMERGE inputs at once;\n"); - fprintf(fp, " for more use temp files\n"); -#endif - fprintf(fp, " -c, --check, --check=diagnose-first check for sorted input; do not sort\n"); - fprintf(fp, " -C, --check=quiet, --check=silent like -c, but do not report first bad line\n"); -#if 0 - fprintf(fp, " --compress-program=PROG compress temporaries with PROG;\n"); - fprintf(fp, " decompress them with PROG -d\n"); - fprintf(fp, " --debug annotate the part of the line used to sort,\n"); - fprintf(fp, " and warn about questionable usage to stderr\n"); - fprintf(fp, " --files0-from=F read input from the files specified by\n"); - fprintf(fp, " NUL-terminated names in file F;\n"); - fprintf(fp, " If F is - then read names from standard input\n"); - fprintf(fp, " -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n"); - fprintf(fp, " (default end of line). 
See POS syntax below\n"); -#endif - fprintf(fp, " -m, --merge merge already sorted files; do not sort\n"); - fprintf(fp, " -o, --output=FILE write result to FILE instead of standard output\n"); -#if 0 - fprintf(fp, " -s, --stable stabilize sort by disabling last-resort comparison\n"); - fprintf(fp, " -S, --buffer-size=SIZE use SIZE for main memory buffer\n"); - fprintf(fp, " -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n"); - fprintf(fp, " -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or /tmp;\n"); - fprintf(fp, " multiple options specify multiple directories\n"); - fprintf(fp, " --parallel=N change the number of sorts run concurrently to N\n"); -#endif - fprintf(fp, " -u, --unique with -c, check for strict ordering;\n"); - fprintf(fp, " without -c, output only the first of an equal run\n"); - fprintf(fp, " -z, --zero-terminated end lines with 0 byte, not newline\n"); - fprintf(fp, " --help display this help and exit\n"); - fprintf(fp, " --version output version information and exit\n"); - fprintf(fp, "\n"); -#if 0 - fprintf(fp, "POS is F[.C][OPTS], where F is the field number and C the character position\n"); - fprintf(fp, "in the field; both are origin 1. If neither -t nor -b is in effect, characters\n"); - fprintf(fp, "in a field are counted from the beginning of the preceding whitespace. OPTS is\n"); - fprintf(fp, "one or more single-letter ordering options, which override global ordering\n"); - fprintf(fp, "options for that key. 
If no key is given, use the entire line as the key.\n"); - fprintf(fp, "\n"); - fprintf(fp, "SIZE may be followed by the following multiplicative suffixes:\n"); - fprintf(fp, "%% 1%% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n"); - fprintf(fp, "\n"); -#endif - fprintf(fp, "With no FILE, or when FILE is -, read standard input.\n"); - fprintf(fp, "\n"); - fprintf(fp, "*** WARNING ***\n"); - fprintf(fp, "The locale specified by the environment affects sort order.\n"); - fprintf(fp, "Set LC_ALL=C to get the traditional sort order that uses\n"); - fprintf(fp, "native byte values.\n"); -} - -static void version(FILE* fp, const char* argv0) -{ - fprintf(fp, "%s (Sortix) %s\n", argv0, VERSIONSTR); -} - int main(int argc, char* argv[]) { setlocale(LC_ALL, ""); @@ -303,7 +199,6 @@ int main(int argc, char* argv[]) bool version_sort = false; bool zero_terminated = false; - const char* argv0 = argv[0]; for ( int i = 1; i < argc; i++ ) { const char* arg = argv[i]; @@ -317,18 +212,14 @@ int main(int argc, char* argv[]) char c; while ( (c = *++arg) ) switch ( c ) { - case 'c': check = true; break; - case 'C': check = check_quiet = false; break; + case 'C': check = true, check_quiet = true; break; + case 'c': check = true, check_quiet = false; break; case 'm': merge = true; break; case 'o': if ( !*(output = arg + 1) ) { if ( i + 1 == argc ) - { - error(0, 0, "option requires an argument -- 'o'"); - fprintf(stderr, "Try `%s --help' for more information.\n", argv[0]); - exit(125); - } + errx(2, "option requires an argument -- 'o'"); output = argv[i+1]; argv[++i] = NULL; } @@ -339,15 +230,9 @@ int main(int argc, char* argv[]) case 'V': version_sort = true; break; case 'z': zero_terminated = true; break; default: - fprintf(stderr, "%s: unknown option -- '%c'\n", argv0, c); - help(stderr, argv0); - exit(1); + errx(2, "unknown option -- '%c'", c); } } - else if ( !strcmp(arg, "--help") ) - help(stdout, argv0), exit(0); - else if ( !strcmp(arg, "--version") ) - 
version(stdout, argv0), exit(0); else if ( !strcmp(arg, "--check") || !strcmp(arg, "--check=diagnose-first") ) check = true, check_quiet = false; @@ -361,11 +246,7 @@ int main(int argc, char* argv[]) else if ( !strcmp(arg, "--output") ) { if ( i + 1 == argc ) - { - error(0, 0, "option '--output' requires an argument"); - fprintf(stderr, "Try `%s --help' for more information.\n", argv[0]); - exit(125); - } + errx(2, "option '--output' requires an argument"); output = argv[i+1]; argv[++i] = NULL; } @@ -378,17 +259,15 @@ int main(int argc, char* argv[]) else if ( !strcmp(arg, "--zero-terminated") ) zero_terminated = true; else - { - fprintf(stderr, "%s: unknown option: %s\n", argv0, arg); - help(stderr, argv0); - exit(1); - } + errx(2, "unknown option: %s", arg); } compact_arguments(&argc, &argv); - if ( output && !freopen(output, "w", stdout) ) - error(2, errno, "`%s'", output); + if ( check_quiet && output ) + errx(1, "the -C and -o options are incompatible"); + if ( check && output ) + errx(1, "the -c and -o options are incompatible"); int delim = zero_terminated ? 
'\0' : '\n'; @@ -413,7 +292,6 @@ int main(int argc, char* argv[]) is.files = (const char* const*) (argv + 1); is.files_current = 0; is.files_length = argc - 1; - is.result_status = true; if ( check ) { @@ -424,10 +302,10 @@ int main(int argc, char* argv[]) { if ( prev_line && compare(line, prev_line) < needed_relation ) { - if ( !check_quiet ) - error(0, errno, "%s:%ju: disorder: %s", is.last_file_path, - is.last_line_number, line); - exit(1); + if ( check_quiet ) + return 1; + errx(1, "%s:%ju: disorder: %s", is.last_file_path, + is.last_line_number, line); } free(prev_line); prev_line = line; @@ -443,14 +321,19 @@ int main(int argc, char* argv[]) qsort(lines, lines_used, sizeof(*lines), qsort_compare); + if ( output && !freopen(output, "w", stdout) ) + err(2, "%s", output); + for ( size_t i = 0; i < lines_used; i++ ) { if ( unique && i && compare(lines[i-1], lines[i]) == 0 ) continue; - fputs(lines[i], stdout); - fputc(delim, stdout); + if ( fputs(lines[i], stdout) == EOF || fputc(delim, stdout) == EOF ) + err(2, "%s", output ? output : ""); } + if ( fflush(stdout) == EOF ) + err(2, "%s", output ? output : ""); } - return is.result_status ? 0 : 2; + return 0; }