#!/usr/bin/env perl

use strict ;
use warnings ;

##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  

##  Eryk Wdowiak
##  original:  30 Aug 2017
##  updated:   27 Feb 2018

##  Perl script to merge: 
##    *  employment status by state -- BLS
##    *  state minimum wage data -- Vaghul and Zipperer 
##    *  average annual pay -- BLS QCEW
##    *  consumer price inflation -- BLS

##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  

##  input files
my $inempl = "originals/BLS_employ-status_USA-states_annual.csv" ; 
my $inminw = "originals/Vaghul-Zipperer_State-Min-Wage_annual.csv" ; 
my $inapay = "originals/BLS_QCEW_annual-pay.txt" ; 
my $incpii = "originals/BLS_consumer-price-inflation.txt" ; 

##  output file
my $otfile = "MinWage-vs-Employment_dataset.csv" ;

##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  

##  hash to hold input
my %indata ;

##  capture the employment status data
my @colsempl = get_csvhdr( $inempl ) ; 
open( INEMPL ,   $inempl  ) || die "could not open $inempl" ;
{ no warnings "uninitialized" ; <INEMPL> while $. < 1 ; }
while (<INEMPL>) {
    chomp; my $line = $_ ;
    my @cols = split( ',' , $line ) ; 
    my %ch ; foreach my $i (0..$#colsempl) { $ch{ $colsempl[$i] } = $cols[$i] ; }
    
    my @empvars = ("civpop","labor_force","lf_part","employed","emp_rate","unemployed","unemp_rate") ;
    foreach my $empvar (@empvars) {
	$indata{$ch{"state"}}{$ch{"year"}}{$empvar} = $ch{$empvar} ; 
    }
}
close INEMPL ; 

##  capture the minimum wage data
my @colsminw = get_csvhdr( $inminw ) ; 
open( INMINW ,   $inminw  ) || die "could not open $inminw" ;
{ no warnings "uninitialized" ; <INMINW> while $. < 1 ; }
while (<INMINW>) {
    chomp; my $line = $_ ;
    my @cols = split( ',' , $line ) ; 
    my %ch ; foreach my $i (0..$#colsminw) { $ch{ $colsminw[$i] } = $cols[$i] ; }
    
    $indata{$ch{"Name"}}{$ch{"Year"}}{"fedrl_minw"} = $ch{"Annual Federal Average"} ; 
    $indata{$ch{"Name"}}{$ch{"Year"}}{"state_minw"} = $ch{"Annual State Average"} ; 
}
close INMINW ; 

##  capture the average annual pay data
##  remember which state we're examining
my $state_now = "" ; 
open( INAPAY , $inapay ) || die "could not open $inapay" ; 
{ no warnings "uninitialized" ; <INAPAY> while $. < 8 ; }
while (<INAPAY>) {
    chomp; my $line = $_ ;
    if ($line =~ /^State:\s+/) { 
	( $state_now = $line ) =~ s/^State:\s+// ; 
    }
    if ($line =~ /^20[01][0-9],/ ) {
	my ($year,$value) = split( ',' , $line ) ; 
	$value =~ s/\(P\)// ; 
	
	$indata{$state_now}{$year}{"avg_annl_pay"} = $value ; 
    }
}
close INAPAY ;

##  capture the CPI data
open( INCPII , $incpii ) || die "could not open $incpii" ; 
{ no warnings "uninitialized" ; 
  <INCPII> while $. < 16 ; 
}
while (<INCPII>) {
    last if $. > 79 ;
    chomp; my $line = $_ ;
    if ($line =~ /^19[789][0-9],/ || $line =~ /^20[01][0-9],/ ) {
	my ($year,$value) = split( ',' , $line ) ; 
	
	$indata{"national"}{$year}{"cpi_index"} = $value ; 
    }
}
{ no warnings "uninitialized" ; 
  <INCPII> while $. < 89 ; 
}
while (<INCPII>) {
    chomp; my $line = $_ ;
    if ($line =~ /^19[789][0-9],/ || $line =~ /^20[01][0-9],/ ) {
	my ($year,$value) = split( ',' , $line ) ; 
	
	  $indata{"national"}{$year}{"cpi_inflation"} = $value ; 
    }
}
close INCPII ; 


##  make list of states and years
my @states = mk_states() ;
my @years = (1976..2016) ;

##  list of variables
my @variables = ("state","year",
		 "civpop","labor_force","lf_part","employed","emp_rate","unemployed","unemp_rate",
		 "fedrl_minw","state_minw","avg_annl_pay","cpi_index","cpi_inflation") ;

##  state dummies 
push( @variables, @states ) ; 

##  year dummies
my @yrnames = @years ; 
s/^/yr/ for @yrnames ;
push( @variables, @yrnames ) ; 

##  concatenate the header
my $otheader = join( "," , @variables ) ; 
$otheader =~ s/\s//g ;

##  write out data
open( OTFILE , ">$otfile" ) || die "could not overwrite $otfile" ;
print OTFILE $otheader . "\n" ; 
foreach my $state (@states) {
    foreach my $year (@years) {

	my @otarray ; 
	push( @otarray , $state ) ; 
	push( @otarray , $year ) ; 

	foreach my $variable ("civpop","labor_force","lf_part","employed","emp_rate","unemployed","unemp_rate",
			      "fedrl_minw","state_minw","avg_annl_pay") {
	    no warnings "uninitialized" ; 
	    push( @otarray , $indata{$state}{$year}{$variable} ) ; 

	} 
	push( @otarray , $indata{"national"}{$year}{"cpi_index"}     ) ;  
	push( @otarray , $indata{"national"}{$year}{"cpi_inflation"} ) ; 

	## state dummy
	my @stateDumAry = grep {$states[$_] =~ /^$state$/} 0..$#states ; 
	my @stateDums   = dumtbl( $stateDumAry[0] , $#states + 1 ) ; 
	push( @otarray , @stateDums ) ; 

	## year dummy
	my @yearDumAry = grep {$years[$_] =~ /^$year$/} 0..$#years ; 
        my @yearDums = dumtbl( $yearDumAry[0] , $#years + 1 ) ;
	push( @otarray , @yearDums ) ; 

	## prepare output string
	my $otline ; 
	{  no warnings "uninitialized" ; 
	   $otline = join( "," , @otarray ) ; 
	}
	
	## print to output file
	print OTFILE $otline . "\n" ; 
    }
} 
close OTFILE ; 

##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  

##  SUBROUTINES
##  ===========

##  make an array of states
sub mk_states {
    
    ##  list of states
    my $state_string ; 
    $state_string .= 'Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware';
    ## $state_string .= '|District of Columbia';
    $state_string .= '|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|';
    $state_string .= 'Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|';
    $state_string .= 'Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|';
    $state_string .= 'New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|';
    $state_string .= 'Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|';
    $state_string .= 'Virginia|Washington|West Virginia|Wisconsin|Wyoming';
    ## $state_string .= '|Los Angeles County|New York city';
    ## $state_string .= '|Virgin Islands|Puerto Rico';
    
    my @state_array = split( '\|' , $state_string ) ; 
    return @state_array ;
}

##  get the header of a CSV file
sub get_csvhdr {
    
    my $csvfile = $_[0] ; 

    ## get header row
    open (CSVFILE, $csvfile ) || die "could not open $csvfile";
    chomp( my $header = <CSVFILE> ) ; 
    close CSVFILE ;
    
    ## split the header for use as column names
    my @colnames = split( /,/ , $header ) ; 
    
    return @colnames ; 
}

##  checks for digits, integers and floats
sub is_digits {
    my $inval = $_[0] ; 
    defined $inval && $inval =~ /^\d+$/;
}

sub is_integer {
    my $inval = $_[0] ; 
    defined $inval && $inval =~ /^[+-]?\d+$/;
}

sub is_float {
    my $inval = $_[0] ; 
    defined $inval && $inval =~ /^[+-]?\d+(\.\d+)?$/;
}

## let's define an R-like "rep" function 
sub rep {

    my $want  = $_[0]     ;
    my $times = $_[1] - 1 ; 

    my @otarray ; 

    for my $i (0..$times) {
	$otarray[$i] = $want ; 
    }
    
    return @otarray ; 
}

## function to make a dummy table 
sub dumtbl {

    my $variable = $_[0]  ; 
    my $possible = $_[1]  ; 

    my @otarray ; 

    if ( ! is_integer($variable) ) { 
	@otarray = rep( "NA" , $possible ) ; 

    } else { 
	@otarray = rep( 0 , $possible ) ; 
	$otarray[$variable] = 1 ; 
    } 

    ## make sure that no more than 0..$lessone returned 
    my $lessone = $possible - 1 ; 
    return @otarray[0..$lessone] ; 
}

