#! /usr/bin/perl -CS use strict; use warnings; sub usage { print STDOUT <<\EoF; Write a file of .sql commands to create from scratch an SQL table containing data and metadata from a given .csv file. Features include: -- We scan each column to determine what SQL data type it requires. -- We allow multi-line headers. -- Flexible scheme for marking the end of the header. -- Field names can come from any line in the header (not necessarily the first line). Usage: :; csv-sql [options] -ifile fname.csv # input file (required) -ofile fname.sql # output file (default based on input file) -table tablename # name of table (default based on input file) -meta 1 # row from which to take field names (default 1) -eoh C:marker # end-of-header marker in column C -eoh marker # end-of-header marker in column A # (default assume just one row of header) -help # print this message (and immediately exit) -debug topic # increase verbosity about topic -debug topics # list the available topics Work that you need to do once: :; mysql -uroot -p mysql create database $dbname; grant all privileges on $dbname.* to $LOGNAME@localhost; flush privileges; where $dbname is some database that can hold the table we create. Work that you need to do once per update: mysql --defaults-extra-file=$HOME/misc/mysql.$dbname.conf source $whatever.sql EoF } use POSIX 'floor'; use Symbol 'gensym'; use Text::CSV_XS 'csv'; use File::Basename; use Scalar::Util 'looks_like_number'; use List::Util 'min', 'max'; use Data::Dumper; # global variables: my %debug; my $ascii_A = ord('A'); sub normalize { my $name = shift; $name =~ s/[-.? ]/_/g; # replace slightly weird characters $name =~ tr/[a-z][A-Z][0-9]_//cd; # eradicate anything weirder return $name; } sub Prefix::new { my $class = shift; $class = ref($class) || $class; return bless { argv => shift, }, $class; } ############## ## utility to help with argument parsing: ## $need==1.1 means "at least" one argument is needed. ## $need==2 means two arguments are needed. sub check_prefix { my ($remaining, $shorter, $longer, $need) = @_; if ($shorter ne substr($longer, 0, length($shorter))) { return 0; } if (defined $need && $need > $remaining) { my $count = floor($need); my $plural = $count==1 ? '' : 's'; my $at_least = $count==$need ? '' : 'at least '; die "Option $longer requires $at_least$count argument$plural.\n"; } return 1; } sub Prefix::check { my $self = shift; return check_prefix(0+$self->{argv}->@*, $self->{arg}, @_); } sub Prefix::next { my $self = shift; $self->{arg} = shift $self->{argv}->@*; } # Convert base26 numerals to/from plain numbers # Examples: # "A" <==> 1 # "Z" <==> 26 # "AA" <==> 27 # "XFD" <==> 16384 sub decode26 { my ($code) = @_; $code =~ tr/a-z/A-Z/; my @bytes = unpack("C*", $code); my $sum = 0; for my $byte (@bytes) { $sum *= 26; my $off = $byte - $ascii_A; if ($off < 0 || $off > 25) { die "decode26: character '$byte' out of range\n"; } $sum += $off + 1; } return $sum; } # support for the -debug option sub get_topics { my %rslt; my $inch = gensym; open ($inch, '<:encoding(UTF-8)', $0) || die "Could not read program '$0'\n"; for my $line (<$inch>) { chomp $line; if ($line =~ m'[$]debug[{]([a-zA-Z]+)[}]') { $rslt{$1}++; } } return \%rslt; } main: { my $pfx = Prefix->new(\@ARGV); sub check {return $pfx->check(@_)} my $meta = 1; my $ifn; my $ofn; my $table; # name of SQL table we are creating my @eoh; # end of header (column, marker) my $topics = get_topics(); while (@ARGV) { $pfx->next; if (check '-help') { usage(); exit(); } elsif (check '-ifile', 1) { $ifn = shift @ARGV; } elsif (check '-ofile', 1) { $ofn = shift @ARGV; } elsif (check '-meta', 1) { $meta = shift @ARGV; } elsif (check '-table', 1) { $table = shift @ARGV; } elsif (check '-eoh', 1) { my $str = shift @ARGV; @eoh = split(':', $str); if (@eoh == 1) { unshift @eoh, 'A'; } } elsif (check '-debug', 1) { my $topic = shift @ARGV; if (! exists $topics->{$topic}) { if ($topic ne 'topics' && $topic !~ m'^[?]') { print STDERR "Unknown topic '$topic'\n" } print STDERR "Available topics are:\n"; for my $top (sort keys %$topics) { print STDERR " $top\n"; } exit 1; } $debug{$topic}++; } elsif ($pfx->{arg} =~ m'^-') { die "Unrecognized option '$pfx->{arg}'\n"; } else { die "Extraneous verbiage '$pfx-{arg}'\n"; } } die "Need a filename\n" if ! defined($ifn); my ($base, $dirs, $sfx) = fileparse($ifn, ".csv"); $ofn //= "$base.sql"; $table = normalize($table // $base); ## print "base: $base dirs: $dirs sfx: $sfx ofn: $ofn table: $table\n"; my $aoa = csv(in => $ifn); my $NR = 0+$aoa->@*; my $header = $aoa->[$meta-1]; my $NC = 0+$header->@*; print join(',', $header->@*), "\n" if $debug{header}; my $ouch = gensym; open($ouch, '>:encoding(UTF-8)', $ofn) || die "Could not open output '$ofn'\n"; # see how many rows are in the header: my $headrows = $meta; ## headrows includes the line that contains the EOH marker, if any if (@eoh) { my $col = decode26($eoh[0]); my $marker = $eoh[1]; my $where; skipper: for (my $ii = 1; $ii < $NR; $ii++) { my $row = $aoa->[$ii]; my $cell = $row->[$col-1]; ## perl is base zero; spreadsheet is base 1 if (defined $cell && $cell eq $marker) { $where = 1 + $ii; last skipper; } } if (! $where) { die "End-of-header marker ('$marker') not found in column $eoh[0] = $col-1\n"; } $headrows = $where; } print "headrows: $headrows\n" if $debug{header}; # See which columns are valid my @valid; for (my $jj = 0; $jj < $NC; $jj++) { my $field = normalize $header->[$jj]; $valid[$jj] = $field ne ''; } # Compute formats: my @formats; for (my $jj = 0; $jj < $NC; $jj++) { my $could_be_num = 1; my $could_be_date = 1; my $len = 0; for (my $ii = $headrows; $ii < $NR; $ii++) { my $value = $aoa->[$ii]->[$jj] // ''; if ($value eq '' || looks_like_number($value)) { # still could be number } else { if ($could_be_num && $debug{number}) { print "datum $header->[$jj] not a number on row $ii '$value'\n"; } $could_be_num = 0; } if ($value eq '' || $value =~ m'[0-9]+/[0-9]+/[0-9]+') { # still could be date } else { if ($could_be_date && $debug{date}) { print "datum $header->[$jj] not a date on row $ii '$value'\n"; } $could_be_date = 0; } $len = max($len, length($value // '')); } if ($could_be_num) { $formats[$jj] = 'float'; } elsif ($could_be_date) { $formats[$jj] = 'date'; } else { $formats[$jj] = "varchar($len)"; } } print $ouch "drop table if exists $table;\n"; print $ouch "create table $table ("; my $comma = ''; definer: for (my $jj = 0; $jj < $NC; $jj++) { next definer if ! $valid[$jj]; print $ouch $comma, "\n"; $comma = ','; my $field = normalize $header->[$jj]; print $ouch "$field $formats[$jj]"; } print $ouch ");\n"; print $ouch "INSERT INTO $table("; $comma = ''; for (my $jj = 0; $jj < $NC; $jj++) { my $field = normalize $header->[$jj]; if ($valid[$jj]) { print $ouch $comma, $field; $comma = ','; } } print $ouch ") values"; my $outcomma = ''; rower: for (my $ii = $headrows; $ii < $NR; $ii++) { my $row = $aoa->[$ii]; print $ouch $outcomma, "\n("; # finish previous row, start new row $outcomma = ','; my $comma = ''; for (my $jj = 0; $jj < $NC; $jj++) { my $field = $row->[$jj]; if ($valid[$jj]) { print $ouch $comma; if (defined $field) { $field =~ s/'/''/g; print $ouch "'$field'"; } else { print $ouch 'null';; } $comma = ','; } } print $ouch ")"; } print $ouch ";\n"; }