diff --git a/utils/Makefile b/utils/Makefile index 6c550e62..a6add9c2 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -82,6 +82,7 @@ memstat.1 \ pager.1 \ passwd.1 \ readlink.1 \ +sort.1 \ uname.1 \ SBINS=\ diff --git a/utils/sort.1 b/utils/sort.1 new file mode 100644 index 00000000..752d6208 --- /dev/null +++ b/utils/sort.1 @@ -0,0 +1,154 @@ +.Dd April 8, 2018 +.Dt SORT 1 +.Os +.Sh NAME +.Nm sort +.Nd sort lines of text +.Sh SYNOPSIS +.Nm +.Op Fl CcmruVz +.Op Fl o Ar path +.Ar +.Sh DESCRIPTION +.Nm +reads lines of text from the standard input and writes the lines in sorted order +to the standard output. +If files are specified, the input is the concatenated content of the files read +in sequential order. +The +.Ar file +path can be set to +.Sq - +to specify the standard input. +The lines are compared according to the current locale's collating rules. +.Pp +The options are as follows: +.Bl -tag -width "12345678" +.It Fl c, \-check, \-check=diagnose-first +Check whether the input is already sorted. +If a line is out of order (or an equal line is found with +.Fl u ) , +write an error describing which line was out of +order and exit 1. +.It Fl C, \-check=quiet, \-check=silent +Same as +.Fl c , +but write no error to the standard error about the input being out of order. +.It Fl m, \-merge +Merge the presorted input files into a sorted output. +.It Fl o Ar path , Fl \-output Ns = Ns Ar path +After reading the full input, write the output to the file at +.Pa path +(creating it if it does not already exist, discarding its previous contents if +it already existed). +The output file can be one of the input files. +This option is incompatible with +.Fl C +and +.Fl c . +.It Fl r , \-reverse +Compare the lines in reverse order. +.It Fl u , \-unique +Don't write a line if it is equal to the previous line. +.It Fl V , \-version-sort +Sort according to the version string, per +.Xr strverscmp 3 . 
+.It Fl z , \-zero-terminated +Lines are delimited with the NUL byte (0) instead of the newline byte (10). +.El +.Sh IMPLEMENTATION NOTES +In the event of an input error, +.Nm +will write an error to the standard error and exit unsuccessfully. +.Pp +.Nm +reads the whole input into memory, rather than storing intermediate sorting +steps in the filesystem, and requires enough memory to store a copy of the whole +input. +.Sh ENVIRONMENT +.Bl -tag -width "LC_COLLATE" +.It Dv LANG +The default locale for locale variables that are unset or null. +.It Dv LC_ALL +Overrides all the other locale variables if set. +.It Dv LC_COLLATE +Compare the input according to this locale's collating rules using +.Xr strcoll 3 . +.El +.Sh EXIT STATUS +.Nm +will exit 0 on success, exit 1 if the input was out of order with +.Fl C +or +.Fl c , +or exit 2 (or higher) otherwise. +.Sh EXAMPLES +Read lines from the standard input and write them in sorted order to the +standard output: +.Bd -literal +sort < input > output +.Ed +.Pp +Read lines from the three specified files (where the second happens to be the +standard input) and write them in sorted order to the output.txt file: +.Bd -literal +grep pattern lines.txt | sort foo - bar -o output.txt +.Ed +.Pp +Sort the input file if it isn't already sorted: +.Bd -literal +if sort -C file; [ $? = 1 ]; then + sort file -o file +fi +.Ed +.Pp +Remove duplicate lines from the input by sorting it and removing lines equal to +the previous line: +.Bd -literal +sort -u +.Ed +.Sh SEE ALSO +.Xr cat 1 , +.Xr comm 1 , +.Xr join 1 , +.Xr uniq 1 , +.Xr qsort 3 , +.Xr strcoll 3 , +.Xr strverscmp 3 +.Sh STANDARDS +.Nm +is standardized in +.St -p1003.1-2008 , +which is currently partially implemented in this implementation of +.Nm . +.Pp +The +.Fl V +and +.Fl z +options, as well as the long options, are extensions also found in GNU +coreutils. +.Pp +As an extension, the +.Fl C +and +.Fl c +options support multiple input files. 
+.Sh BUGS +The +.St -p1003.1-2008 +options +.Fl b , +.Fl d , +.Fl f , +.Fl i , +.Fl k , +.Fl n , +and +.Fl t +are not currently implemented. +.Pp +The +.Fl m +option is not currently taken advantage of to speed up the sorting, rather the +presorted input files are sorted all over again. diff --git a/utils/sort.c b/utils/sort.c index a5d4d41a..96b1c92e 100644 --- a/utils/sort.c +++ b/utils/sort.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, 2015 Jonas 'Sortie' Termansen. + * Copyright (c) 2014, 2015, 2018 Jonas 'Sortie' Termansen. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -17,8 +17,8 @@ * Sort, merge, or sequence check text files. */ +#include #include -#include #include #include #include @@ -29,55 +29,55 @@ // TODO: Implement all the features mandated by POSIX. // TODO: Implement the useful GNU extensions. -int flip_comparison(int rel) +static int flip_comparison(int rel) { return rel < 0 ? 1 : 0 < rel ? 
-1 : 0; } -int indirect_compare(int (*compare)(const char*, const char*), - const void* a_ptr, const void* b_ptr) +static int indirect_compare(int (*compare)(const char*, const char*), + const void* a_ptr, const void* b_ptr) { const char* a = *(const char* const*) a_ptr; const char* b = *(const char* const*) b_ptr; return compare(a, b); } -int compare_line(const char* a, const char* b) +static int compare_line(const char* a, const char* b) { return strcoll(a, b); } -int indirect_compare_line(const void* a_ptr, const void* b_ptr) +static int indirect_compare_line(const void* a_ptr, const void* b_ptr) { return indirect_compare(compare_line, a_ptr, b_ptr); } -int compare_line_reverse(const char* a, const char* b) +static int compare_line_reverse(const char* a, const char* b) { return flip_comparison(compare_line(a, b)); } -int indirect_compare_line_reverse(const void* a_ptr, const void* b_ptr) +static int indirect_compare_line_reverse(const void* a_ptr, const void* b_ptr) { return indirect_compare(compare_line_reverse, a_ptr, b_ptr); } -int compare_version(const char* a, const char* b) +static int compare_version(const char* a, const char* b) { return strverscmp(a, b); } -int indirect_compare_version(const void* a_ptr, const void* b_ptr) +static int indirect_compare_version(const void* a_ptr, const void* b_ptr) { return indirect_compare(compare_version, a_ptr, b_ptr); } -int compare_version_reverse(const char* a, const char* b) +static int compare_version_reverse(const char* a, const char* b) { return flip_comparison(compare_version(a, b)); } -int indirect_compare_version_reverse(const void* a_ptr, const void* b_ptr) +static int indirect_compare_version_reverse(const void* a_ptr, const void* b_ptr) { return indirect_compare(compare_version_reverse, a_ptr, b_ptr); } @@ -90,10 +90,9 @@ struct input_stream FILE* current_file; const char* last_file_path; uintmax_t last_line_number; - bool result_status; }; -char* read_line(FILE* fp, const char* fpname, int delim) +static 
char* read_line(FILE* fp, const char* fpname, int delim) { char* line = NULL; size_t line_size = 0; @@ -102,7 +101,7 @@ char* read_line(FILE* fp, const char* fpname, int delim) { free(line); if ( ferror(fp) ) - error(0, errno, "read: `%s'", fpname); + err(2, "read: %s", fpname); return NULL; } if ( (unsigned char) line[amount-1] == (unsigned char) delim ) @@ -110,15 +109,14 @@ char* read_line(FILE* fp, const char* fpname, int delim) return line; } -char* read_input_stream_line(struct input_stream* is, int delim) +static char* read_input_stream_line(struct input_stream* is, int delim) { if ( !is->files_length ) { char* result = read_line(stdin, "", delim); - if ( ferror(stdin) ) - is->result_status = false; is->last_file_path = "-"; - is->last_line_number++; + if ( result ) + is->last_line_number++; return result; } while ( is->files_current < is->files_length ) @@ -130,21 +128,11 @@ char* read_input_stream_line(struct input_stream* is, int delim) if ( !strcmp(path, "-") ) is->current_file = stdin; else if ( !(is->current_file = fopen(path, "r")) ) - { - error(0, errno, "`%s'", path); - is->result_status = false; - is->files_current++; - continue; - } + err(2, "%s", path); } char* result = read_line(is->current_file, path, delim); if ( !result ) { - if ( ferror(is->current_file) ) - { - error(0, errno, "reading: `%s'", path); - is->result_status = false; - } if ( is->current_file != stdin ) fclose(is->current_file); is->current_file = NULL; @@ -158,9 +146,9 @@ char* read_input_stream_line(struct input_stream* is, int delim) return NULL; } -char** read_input_stream_lines(size_t* result_num_lines, - struct input_stream* is, - int delim) +static char** read_input_stream_lines(size_t* result_num_lines, + struct input_stream* is, + int delim) { char** lines = NULL; size_t lines_used = 0; @@ -171,18 +159,13 @@ char** read_input_stream_lines(size_t* result_num_lines, { if ( lines_used == lines_length ) { - size_t new_lines_length = lines_length ? 
2 * lines_length : 64; - size_t new_lines_size = sizeof(char*) * new_lines_length; - char** new_lines = (char**) realloc(lines, new_lines_size); + size_t old_lines_length = lines_length ? lines_length : 64; + char** new_lines = (char**) reallocarray(lines, old_lines_length, + 2 * sizeof(char*)); if ( !new_lines ) - { - error(0, errno, "realloc"); - free(line); - is->result_status = false; - return *result_num_lines = lines_used, lines; - } + err(2, "malloc"); lines = new_lines; - lines_length = new_lines_length; + lines_length = 2 * old_lines_length; } lines[lines_used++] = line; } @@ -203,93 +186,6 @@ static void compact_arguments(int* argc, char*** argv) } } -static void help(FILE* fp, const char* argv0) -{ - fprintf(fp, "Usage: %s [OPTION]... [FILE]...\n", argv0); - fprintf(fp, "Write sorted concatenation of all FILE(s) to standard output.\n"); - fprintf(fp, "\n"); - fprintf(fp, "Mandatory arguments to long options are mandatory for short options too.\n"); - fprintf(fp, "Ordering options:\n"); - fprintf(fp, "\n"); -#if 0 - fprintf(fp, " -b, --ignore-leading-blanks ignore leading blanks\n"); - fprintf(fp, " -d, --dictionary-order consider only blanks and alphanumeric characters\n"); - fprintf(fp, " -f, --ignore-case fold lower case to upper case characters\n"); - fprintf(fp, " -g, --general-numeric-sort compare according to general numerical value\n"); - fprintf(fp, " -i, --ignore-nonprinting consider only printable characters\n"); - fprintf(fp, " -M, --month-sort compare (unknown) < `JAN' < ... 
< `DEC'\n"); - fprintf(fp, " -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n"); - fprintf(fp, " -n, --numeric-sort compare according to string numerical value\n"); - fprintf(fp, " -R, --random-sort sort by random hash of keys\n"); - fprintf(fp, " --random-source=FILE get random bytes from FILE\n"); -#endif - fprintf(fp, " -r, --reverse reverse the result of comparisons\n"); -#if 0 - fprintf(fp, " --sort=WORD sort according to WORD:\n"); - fprintf(fp, " general-numeric -g, human-numeric -h, month -M,\n"); - fprintf(fp, " numeric -n, random -R, version -V\n"); -#endif - fprintf(fp, " -V, --version-sort natural sort of (version) numbers within text\n"); - fprintf(fp, "\n"); - fprintf(fp, "Other options:\n"); - fprintf(fp, "\n"); -#if 0 - fprintf(fp, " --batch-size=NMERGE merge at most NMERGE inputs at once;\n"); - fprintf(fp, " for more use temp files\n"); -#endif - fprintf(fp, " -c, --check, --check=diagnose-first check for sorted input; do not sort\n"); - fprintf(fp, " -C, --check=quiet, --check=silent like -c, but do not report first bad line\n"); -#if 0 - fprintf(fp, " --compress-program=PROG compress temporaries with PROG;\n"); - fprintf(fp, " decompress them with PROG -d\n"); - fprintf(fp, " --debug annotate the part of the line used to sort,\n"); - fprintf(fp, " and warn about questionable usage to stderr\n"); - fprintf(fp, " --files0-from=F read input from the files specified by\n"); - fprintf(fp, " NUL-terminated names in file F;\n"); - fprintf(fp, " If F is - then read names from standard input\n"); - fprintf(fp, " -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n"); - fprintf(fp, " (default end of line). 
See POS syntax below\n"); -#endif - fprintf(fp, " -m, --merge merge already sorted files; do not sort\n"); - fprintf(fp, " -o, --output=FILE write result to FILE instead of standard output\n"); -#if 0 - fprintf(fp, " -s, --stable stabilize sort by disabling last-resort comparison\n"); - fprintf(fp, " -S, --buffer-size=SIZE use SIZE for main memory buffer\n"); - fprintf(fp, " -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n"); - fprintf(fp, " -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or /tmp;\n"); - fprintf(fp, " multiple options specify multiple directories\n"); - fprintf(fp, " --parallel=N change the number of sorts run concurrently to N\n"); -#endif - fprintf(fp, " -u, --unique with -c, check for strict ordering;\n"); - fprintf(fp, " without -c, output only the first of an equal run\n"); - fprintf(fp, " -z, --zero-terminated end lines with 0 byte, not newline\n"); - fprintf(fp, " --help display this help and exit\n"); - fprintf(fp, " --version output version information and exit\n"); - fprintf(fp, "\n"); -#if 0 - fprintf(fp, "POS is F[.C][OPTS], where F is the field number and C the character position\n"); - fprintf(fp, "in the field; both are origin 1. If neither -t nor -b is in effect, characters\n"); - fprintf(fp, "in a field are counted from the beginning of the preceding whitespace. OPTS is\n"); - fprintf(fp, "one or more single-letter ordering options, which override global ordering\n"); - fprintf(fp, "options for that key. 
If no key is given, use the entire line as the key.\n"); - fprintf(fp, "\n"); - fprintf(fp, "SIZE may be followed by the following multiplicative suffixes:\n"); - fprintf(fp, "%% 1%% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n"); - fprintf(fp, "\n"); -#endif - fprintf(fp, "With no FILE, or when FILE is -, read standard input.\n"); - fprintf(fp, "\n"); - fprintf(fp, "*** WARNING ***\n"); - fprintf(fp, "The locale specified by the environment affects sort order.\n"); - fprintf(fp, "Set LC_ALL=C to get the traditional sort order that uses\n"); - fprintf(fp, "native byte values.\n"); -} - -static void version(FILE* fp, const char* argv0) -{ - fprintf(fp, "%s (Sortix) %s\n", argv0, VERSIONSTR); -} - int main(int argc, char* argv[]) { setlocale(LC_ALL, ""); @@ -303,7 +199,6 @@ int main(int argc, char* argv[]) bool version_sort = false; bool zero_terminated = false; - const char* argv0 = argv[0]; for ( int i = 1; i < argc; i++ ) { const char* arg = argv[i]; @@ -317,18 +212,14 @@ int main(int argc, char* argv[]) char c; while ( (c = *++arg) ) switch ( c ) { - case 'c': check = true; break; - case 'C': check = check_quiet = false; break; + case 'C': check = true, check_quiet = true; break; + case 'c': check = true, check_quiet = false; break; case 'm': merge = true; break; case 'o': if ( !*(output = arg + 1) ) { if ( i + 1 == argc ) - { - error(0, 0, "option requires an argument -- 'o'"); - fprintf(stderr, "Try `%s --help' for more information.\n", argv[0]); - exit(125); - } + errx(2, "option requires an argument -- 'o'"); output = argv[i+1]; argv[++i] = NULL; } @@ -339,15 +230,9 @@ int main(int argc, char* argv[]) case 'V': version_sort = true; break; case 'z': zero_terminated = true; break; default: - fprintf(stderr, "%s: unknown option -- '%c'\n", argv0, c); - help(stderr, argv0); - exit(1); + errx(2, "unknown option -- '%c'", c); } } - else if ( !strcmp(arg, "--help") ) - help(stdout, argv0), exit(0); - else if ( !strcmp(arg, "--version") ) - 
version(stdout, argv0), exit(0); else if ( !strcmp(arg, "--check") || !strcmp(arg, "--check=diagnose-first") ) check = true, check_quiet = false; @@ -361,11 +246,7 @@ int main(int argc, char* argv[]) else if ( !strcmp(arg, "--output") ) { if ( i + 1 == argc ) - { - error(0, 0, "option '--output' requires an argument"); - fprintf(stderr, "Try `%s --help' for more information.\n", argv[0]); - exit(125); - } + errx(2, "option '--output' requires an argument"); output = argv[i+1]; argv[++i] = NULL; } @@ -378,17 +259,15 @@ int main(int argc, char* argv[]) else if ( !strcmp(arg, "--zero-terminated") ) zero_terminated = true; else - { - fprintf(stderr, "%s: unknown option: %s\n", argv0, arg); - help(stderr, argv0); - exit(1); - } + errx(2, "unknown option: %s", arg); } compact_arguments(&argc, &argv); - if ( output && !freopen(output, "w", stdout) ) - error(2, errno, "`%s'", output); + if ( check_quiet && output ) + errx(1, "the -C and -o options are incompatible"); + if ( check && output ) + errx(1, "the -c and -o options are incompatible"); int delim = zero_terminated ? 
'\0' : '\n'; @@ -413,7 +292,6 @@ int main(int argc, char* argv[]) is.files = (const char* const*) (argv + 1); is.files_current = 0; is.files_length = argc - 1; - is.result_status = true; if ( check ) { @@ -424,10 +302,10 @@ int main(int argc, char* argv[]) { if ( prev_line && compare(line, prev_line) < needed_relation ) { - if ( !check_quiet ) - error(0, errno, "%s:%ju: disorder: %s", is.last_file_path, - is.last_line_number, line); - exit(1); + if ( check_quiet ) + return 1; + errx(1, "%s:%ju: disorder: %s", is.last_file_path, + is.last_line_number, line); } free(prev_line); prev_line = line; @@ -443,14 +321,19 @@ int main(int argc, char* argv[]) qsort(lines, lines_used, sizeof(*lines), qsort_compare); + if ( output && !freopen(output, "w", stdout) ) + err(2, "%s", output); + for ( size_t i = 0; i < lines_used; i++ ) { if ( unique && i && compare(lines[i-1], lines[i]) == 0 ) continue; - fputs(lines[i], stdout); - fputc(delim, stdout); + if ( fputs(lines[i], stdout) == EOF || fputc(delim, stdout) == EOF ) + err(2, "%s", output ? output : ""); } + if ( fflush(stdout) == EOF ) + err(2, "%s", output ? output : ""); } - return is.result_status ? 0 : 2; + return 0; }