#!/usr/bin/perl # # count # # AUTHOR: # Dan Harkless # # COPYRIGHT: # This file is Copyright (C) 2012 by Dan Harkless, and is released under the # GNU General Public License . # # USAGE: # % count [-c] [-n] [-r] [...] # # DESCRIPTION: # Like 'sort -u', but with instance counts. # # More specifically, takes a list of files on the commandline (or reads stdin # if none are specified), and for each unique input line, outputs that line # once, in sorted order, along with the count of times it appeared on input # (with a TAB separating data and count). # # COMMANDLINE OPTIONS: # -c # In the default mode, output lines are of the form "\t", # and are sorted by the text. With -c, output lines are of the # form "\t", and the primary sort key is the # text, with the secondary sort key the text. # # -n # Causes input lines to be sorted in numerical rather than ASCII order. In # this case, "numerical" means not just integers, but strings of numbers # with an arbitrary number of '.' separators. This can mean floating-point # numbers, dotted-quad IP addresses, etc. # # -r # Sort in reverse (descending rather than ascending) order. Note that in -c # mode, this also applies to the sorting of the subkey for same- # lines. # # DATE MODIFICATION # ========== ================================================================== # 2012-05-24 Read that $a and $b in sort functions aren't supposed to use 'my'. # 2008-09-02 "use English qw(-no_match_vars)": avoid regex performance penalty. # 2004-02-28 Be tolerant of -n being used on non-numeric data, as 'sort -n' is. # Don't emit warnings; just fall back to ASCII sort on those lines. # 2004-02-27 Original. ## Modules used ################################################################ use English qw(-no_match_vars); # allow use of names like @ARG rather than @_ use File::Basename; # for basename() use Getopt::Std; # for getopts() use warnings; # output warnings for this script but not for modules used ## Subroutines ################################################################# sub ASCII_cmp { if ($opt_r) { $b = shift; $a = shift; } else { $a = shift; $b = shift; } return $a cmp $b; } sub dotted_number_cmp { if ($opt_r) { $b = shift; $a = shift; } else { $a = shift; $b = shift; } my @a_components = split(/\./, $a); my @b_components = split(/\./, $b); my $min_components = ($#a_components < $#b_components ? $#a_components : $#b_components); for (my $i = 0; $i <= $min_components; $i++) { if ($a_components[$i] !~ /^\d+$/ or $b_components[$i] !~ /^\d+$/) { if ($a_components[$i] ne $b_components[$i]) { return $a_components[$i] cmp $b_components[$i]; } } else { if ($a_components[$i] != $b_components[$i]) { return $a_components[$i] <=> $b_components[$i]; } } } if ($#a_components != $#b_components) { return $#a_components <=> $#b_components; } return 0; } ## Main ######################################################################## $progname = basename($PROGRAM_NAME); use vars qw($opt_c $opt_n $opt_r); # eliminate "used only once" warning if (not getopts('cnr')) { print STDERR "Usage: $progname [-c] [-n] [-r] [...]\n"; exit 1; } while (<>) { chomp $ARG; $count{$ARG}++; } if ($opt_c) { foreach $key (sort {dotted_number_cmp($count{$a}, $count{$b}) || ($opt_n ? dotted_number_cmp($a, $b) : ASCII_cmp($a, $b));} keys %count) { print "$count{$key}\t$key\n"; } } else { foreach $key (sort {($opt_n ? dotted_number_cmp($a, $b) : ASCII_cmp($a, $b));} keys %count) { print "$key\t$count{$key}\n"; } }