#!/usr/bin/perl -w
#
# access_log_domains
#
# AUTHOR:
#   Dan Harkless
#
# COPYRIGHT:
#   This file is Copyright (C) 2008 by Dan Harkless, and is released under the
#   GNU General Public License.
#
# USAGE:
#   % access_log_domains [-f] [...]
#
# DESCRIPTION:
#   Outputs all the unique domains found among website visitors in an Apache
#   access_log.  For each one, outputs the hostname and timestamp of the last
#   visitor from that domain, and optionally the hostname / timestamp of the
#   first visitor as well.
#
#   If multiple (e.g. logrotated) access_log files are fed to
#   access_log_domains, they must be specified in oldest-to-newest order for the
#   first and last visitor output to come out correctly (first and last are
#   determined by order of appearance, not by parsing and comparing timestamps).
#
#   access_log_domains correctly handles two-level ccTLDs, i.e. country-code
#   TLDs under which domains are registered one level further down (such as
#   co.jp).  For example, with hostnames harkless.org, cow.boy.com,
#   www.foo.bar.com, ushi.otoko.co.jp, and www.nantoka.nanika.co.jp, it will
#   correctly realize that the domains are harkless.org, boy.com, bar.com,
#   otoko.co.jp, and nanika.co.jp.
#
#   No attempt is made to reverse lookup IP addresses that could not be reverse
#   looked up at the time of logging.
#
# COMMANDLINE OPTIONS:
#   -f
#     After the domain and colon, output the hostname and timestamp of the
#     first visitor from the domain.  After that, if it's different, the usual
#     hostname / timestamp of the _last_ visitor from the domain is printed.
#
# EXAMPLE:
#   % zcat -f access_log.2.gz access_log.1.gz access_log | access_log_domains -f
#
# DATE        MODIFICATION
# ==========  ==================================================================
# 2008-09-02  "use English qw(-no_match_vars)": avoid regex performance penalty.
# 2005-01-31  Original.
## Modules used ################################################################

use strict;
use warnings;

use English qw(-no_match_vars);  # allow use of names like @ARG rather than @_
use File::Basename;              # for basename()
use Getopt::Std;                 # for getopts()


## Data ########################################################################

# Registry suffixes under which domains are registered one label further down
# (e.g. "co.jp": the registered domain in "ushi.otoko.co.jp" is "otoko.co.jp").
# Keys are lower-case; most have two labels, a few (under .im) have three.
# Gathered 2005-01-31.
#
# NOTE(review): four entries differ from the list as originally gathered and
# should be confirmed against the registries' own documentation:
#   - "lkd.co.im" -> "ltd.co.im"  (presumed typo)
#   - "ort.np"    -> "org.np"     (presumed typo)
#   - "gove.tw"   -> "gov.tw"     (presumed typo)
#   - the second, duplicate "org.au" -> "edu.au" (presumed copy/paste slip)
my %two_level_ccTLDs = map { $_ => 1 } qw(
    com.ac edu.ac gov.ac net.ac mil.ac org.ac
    com.ae net.ae org.ae
    com.ar net.ar org.ar
    co.at ac.at
    com.au org.au gov.au edu.au id.au oz.au info.au net.au asn.au csiro.au
    telememo.au conf.au
    com.az net.az org.az
    com.bb net.bb org.bb
    com.bm edu.bm gov.bm org.bm net.bm
    art.br com.br esp.br etc.br g12.br gov.br ind.br inf.br mil.br net.br
    org.br psi.br rec.br tmp.br
    com.bs net.bs org.bs
    ab.ca bc.ca mb.ca nb.ca nf.ca ns.ca nt.ca on.ca pe.ca qc.ca sk.ca yk.ca
    co.ck
    ac.cn com.cn edu.cn gov.cn net.cn org.cn bj.cn sh.cn tj.cn cq.cn he.cn
    sx.cn nm.cn ln.cn jl.cn hl.cn js.cn zj.cn ah.cn hb.cn hn.cn gd.cn gx.cn
    hi.cn sc.cn gz.cn yn.cn xz.cn sn.cn gs.cn qh.cn nx.cn xj.cn tw.cn hk.cn
    mo.cn
    arts.co com.co edu.co firm.co gov.co info.co int.co nom.co mil.co org.co
    rec.co store.co web.co
    ac.cr co.cr ed.cr fi.cr go.cr or.cr sa.cr
    com.cu net.cu org.cu
    ac.cy com.cy gov.cy net.cy org.cy
    art.do com.do edu.do gov.do org.do mil.do net.do web.do
    com.ec k12.ec edu.ec fin.ec med.ec gov.ec mil.ec org.ec net.ec
    com.eg edu.eg eun.eg gov.eg net.eg org.eg sci.eg
    ac.fj com.fj gov.fj id.fj org.fj school.fj
    com.ge edu.ge gov.ge mil.ge net.ge org.ge pvt.ge
    co.gg org.gg sch.gg ac.gg gov.gg ltd.gg ind.gg net.gg alderney.gg
    guernsey.gg sark.gg
    edu.gu com.gu mil.gu gov.gu net.gu org.gu
    com.hk org.hk net.hk
    co.hu org.hu priv.hu info.hu tm.hu nui.hu
    ac.id co.id go.id mil.id net.id or.id
    k12.il org.il ac.il gov.il muni.il co.il net.il
    co.im ltd.co.im plc.co.im net.im gov.im org.im nic.im ac.im
    ernet.in nic.in ac.in co.in gov.in net.in res.in
    co.je org.je sch.je ac.je gov.je ltd.je ind.je net.je jersey.je
    com.jo gov.jo edu.jo net.jo
    ad.jp ac.jp co.jp net.jp org.jp gov.jp
    com.kh net.kh org.kh
    ac.kr co.kr go.kr or.kr re.kr nm.kr
    com.la net.la org.la
    com.lb org.lb net.lb gov.lb mil.lb
    com.lc edu.lc gov.lc net.lc org.lc
    com.lv edu.lv gov.lv org.lv mil.lv id.lv net.lv asn.lv conf.lv
    com.ly net.ly org.ly
    edu.mm com.mm gov.mm net.mm org.mm
    com.mo edu.mo gov.mo net.mo org.mo
    com.mt net.mt org.mt
    com.mx net.mx org.mx
    com.my org.my gov.my edu.my net.my
    com.na org.na net.na
    com.nc net.nc org.nc
    com.ni
    com.np net.np org.np
    co.nz org.nz net.nz govt.nz ac.nz gen.nz
    ac.pa com.pa net.pa org.pa edu.pa gob.pa sld.pa
    com.pe net.pe org.pe
    com.ph net.ph org.ph mil.ph ngo.ph
    com.pl net.pl org.pl
    com.py net.py org.py edu.py
    com.ru net.ru org.ru
    com.sg net.sg org.sg edu.sg gov.sg
    com.sh edu.sh gov.sh net.sh mil.sh org.sh
    co.sv
    com.sy net.sy org.sy
    ac.th co.th go.th net.th or.th
    com.tn ind.tn tourism.tn fin.tn net.tn gov.tn nat.tn org.tn info.tn
    ens.tn intl.tn rnrt.tn rnu.tn rns.tn edunet.tn
    bbs.tr com.tr edu.tr gov.tr k12.tr mil.tr net.tr org.tr
    com.tw net.tw org.tw edu.tw gov.tw
    com.ua net.ua gov.ua
    ac.ug co.ug or.ug go.ug
    co.uk org.uk ltd.uk plc.uk net.uk sch.uk ac.uk gov.uk nhs.uk police.uk
    mod.uk
    com.uy edu.uy net.uy org.uy
    com.ve edu.ve gov.ve net.ve co.ve bib.ve tec.ve int.ve org.ve firm.ve
    store.ve web.ve arts.ve rec.ve info.ve nom.ve mil.ve
    co.vi net.vi org.vi
    ac.yu co.yu edu.yu org.yu
    ac.za alt.za co.za edu.za gov.za mil.za net.za ngo.za nom.za org.za
    school.za tm.za web.za
);


## Subroutines #################################################################

# parse_log_line($line)
#
# Extracts the visitor hostname and the bracketed timestamp from one
# access_log line.  Returns the list ($hostname, $timestamp), where $timestamp
# still includes its surrounding square brackets.  Returns the empty list when
# the line does not match, which includes lines whose first field has no
# ".<letter>" in it (e.g. an IPv4 address that was never reverse-resolved).
sub parse_log_line {
    my ($line) = @_;

    if ($line =~ /^([^ ]+\.[a-zA-Z][^ ]+) [^[]+(\[[^]]+])/) {
        return ($1, $2);
    }
    return;
}

# domain_of_host($hostname)
#
# Returns the lower-cased domain that $hostname belongs to: normally its last
# two labels, or one label more than the longest entry of %two_level_ccTLDs
# that the hostname ends in (so "www.nantoka.nanika.co.jp" gives
# "nanika.co.jp").  A hostname that consists of nothing but a table entry
# (e.g. "co.jp") falls back to the shorter rule instead of running off the
# front of the name.
sub domain_of_host {
    my ($hostname) = @_;

    # DNS names are case-insensitive, so compare and report in lower case;
    # otherwise "Foo.COM" and "foo.com" would show up as separate domains and
    # "X.CO.UK" would miss the table altogether.
    my @labels = split /\./, lc $hostname;
    return join '.', @labels if @labels <= 2;

    # Try the longest suffix first so that three-label entries such as
    # "plc.co.im" take precedence over their two-label parent "co.im".
    foreach my $suffix_len (3, 2) {
        next if @labels <= $suffix_len;  # need one more label than the suffix
        my $suffix = join '.', @labels[-$suffix_len .. -1];
        if ($two_level_ccTLDs{$suffix}) {
            return join '.', @labels[-($suffix_len + 1) .. -1];
        }
    }
    return join '.', @labels[-2 .. -1];
}

# main()
#
# Parses the command line, reads access_log lines from the files named in @ARGV
# (or from standard input when there are none), and prints one
# "domain: [first visit] last visit" line per domain, sorted by domain.  Exits
# with status 1 after printing a usage message if option parsing fails.
sub main {
    my $progname = basename($PROGRAM_NAME);

    my %opt;
    if (not getopts('f', \%opt)) {
        print STDERR "Usage: $progname [-f] [...]\n";
        exit 1;
    }
    my $want_first = $opt{f};

    # "First" and "last" mean order of appearance in the input; timestamps are
    # carried along verbatim and never compared.
    my (%first_visit, %last_visit);
    while (my $line = <>) {
        my ($hostname, $timestamp) = parse_log_line($line);
        next if not defined $hostname;

        my $domain = domain_of_host($hostname);
        my $visit  = "$hostname $timestamp";

        if ($want_first and not exists $first_visit{$domain}) {
            $first_visit{$domain} = $visit;
        }
        $last_visit{$domain} = $visit;
    }

    foreach my $domain (sort keys %last_visit) {
        print "$domain:";
        if ($want_first) {
            print " $first_visit{$domain}";
        }
        # With -f, the last visit is only repeated when it differs from the
        # first one.
        if (not $want_first
            or ($last_visit{$domain} ne $first_visit{$domain})) {
            print " $last_visit{$domain}";
        }
        print "\n";
    }
    return;
}


## Main ########################################################################

# Run only when executed as a program, so that the subroutines above can be
# loaded (e.g. by a test) without consuming standard input.
main() unless caller;