#!/usr/bin/perl -s

# tsvdesc - describe columns of tab-separated text
# Steve Kinzler, kinzler@cs.indiana.edu, Jan 05/Mar 08
# http://www.cs.indiana.edu/~kinzler/home.html#unix
#
# Reads tab-separated text from the named files (or standard input) and
# prints the number of data lines, followed by one summary line per column:
# its label, the longest field length (shown as a VARCHAR2 size), the
# shortest field length, the number of duplicate values, the number of
# double-quoted values, and the number of values that carried leading or
# trailing whitespace.  Lengths are measured after unquoting and trimming.

use strict;
use warnings;

# Switches are set by perl's own -s processing (see the #! line).
# The usage text documents -d, but the code historically tested $u, so
# both spellings are honoured to stay backward-compatible.
our ($h, $t, $d, $u, $n);

my $usage = "usage: $0 [ -t ] [ -d ] [ -n ] [ file ... ]
\t-t\tinterpret the first row as column titles
\t-d\tdon't count duplicates in the columns
\t-n\tleave <tab>, <nl> and <cr> in strings as literal\n";
die $usage if $h;

my $skip_dups = $d || $u;

my @labels;             # column titles taken from the first row under -t
my %seen;               # "column,value" => times that value was seen
my %dups;               # column => count of repeated values
my %quoted;             # column => count of values wrapped in double quotes
my %spaced;             # column => count of values with edge whitespace
my %max_len;            # column => longest value length
my %min_len;            # column => shortest value length
my $ncol  = 0;          # widest row seen, in columns
my $lines = 0;          # data lines processed (title row excluded)

while (my $line = <>) {
    $line =~ s/[\r\n]*$//;

    # A negative limit keeps trailing empty fields, so they are measured
    # the same way as empty fields in the middle of a row.
    my @fields = split /\t/, $line, -1;

    # $. keeps counting across files read through <>, so only the very
    # first input line is ever taken as the title row.
    if ($t && $. == 1) {
        @labels = @fields;
        next;
    }
    $lines++;

    my $col = 0;
    for my $field (@fields) {
        $col++;

        # NOTE(review): the escape-token spellings were missing from this
        # file (the patterns were empty, which in Perl reuses the last
        # successful pattern).  <tab>/<nl>/<cr> are reconstructed from the
        # replacement characters -- confirm against the data producer.
        unless ($n) {
            $field =~ s/<tab>/\t/g;
            $field =~ s/<nl>/\n/g;
            $field =~ s/<cr>/\r/g;
        }

        $quoted{$col}++ if $field =~ s/^"(.*)"$/$1/;

        # Trim both ends unconditionally; chaining the two substitutions
        # with || would skip the trailing trim whenever the leading one
        # matched.
        my $had_leading  = ($field =~ s/^\s+//);
        my $had_trailing = ($field =~ s/\s+$//);
        $spaced{$col}++ if $had_leading || $had_trailing;

        $dups{$col}++ if !$skip_dups && $seen{"$col,$field"}++;

        my $len = length $field;
        $max_len{$col} = max($max_len{$col}, $len);
        $min_len{$col} = min($min_len{$col}, $len);
        $ncol          = max($ncol, $col);
    }
}

# Count lines ourselves rather than deriving the total from $., which is
# undefined when no input was read at all.
print "$lines lines\n";

for my $col (1 .. $ncol) {
    # An empty or missing title falls back to a generated name.
    my $label = shift(@labels) || "Column_$col";

    my $row = $label
            . "\tVARCHAR2(" . ($max_len{$col} || 0) . ")"
            . "\tminlen "   . ($min_len{$col} || 0);
    $row .= "\t" . ($dups{$col}   || 0) . " dups" unless $skip_dups;
    $row .= "\t" . ($quoted{$col} || 0) . " quoted";
    $row .= "\t" . ($spaced{$col} || 0) . " spaced\n";
    print $row;
}

# Return the larger of two numbers.  An undefined first argument means
# "no value recorded yet" and yields the second argument.
sub max {
    my ($current, $candidate) = @_;
    return (defined $current && $current > $candidate) ? $current : $candidate;
}

# Return the smaller of two numbers.  An undefined first argument means
# "no value recorded yet" and yields the second argument.
sub min {
    my ($current, $candidate) = @_;
    return (defined $current && $current < $candidate) ? $current : $candidate;
}