RA Coding Exercise

Question 1

I don’t think we need to include the main effect term for Import Shock because if the goal of the regression is to see if RD tax credit can absorb import shocks, we have already controlled for import shocks in the interaction terms. Including the main effect term will only change the constant terms or the interpretation of the other coefficients.

Question 2

library(haven)
library(tidyverse)
library(stringr)
library(formattable)
library(plm)

compustat <- read_dta('compustat.dta')
patent <- read_dta('pdpcohdr.dta')
inventor <- read_dta('inventors.dta')
tax_credit <- read_dta('rd_tax_credit_state.dta')
patassg <- read_dta('patassg.dta')
dynass <- read_dta('dynass.dta')

2.1

gvkey <- compustat$gvkey
gvkey <- str_remove(gvkey, "^0+") #remove the leading 0s to merge
compustat$gvkey <- gvkey
compustat_patent <- merge(compustat, patent, by = 'gvkey')
compustat_patent$gvkey <- as.numeric(compustat_patent$gvkey)
compustat_patent <- compustat_patent %>% select(-2,-12:-14,-18:-20,-23:-25,-33) #drop some cols
formattable(head(compustat_patent,10))

gvkey	fyear	indfmt	conm	fyr	at	capx	ebitda	ppent	sale	sic	state	sic4	year	cusip	firstyr	lastyr	pdpco	pdpseq	begyr	endyr
10000	2011	INDL	STANDARD MOTOR PRODS	12	550.722	11.037	75.847	64.039	874.625	3690	NY	3690	2011	853666	1960	2006	10000	1	1960	2006
10000	2014	INDL	STANDARD MOTOR PRODS	12	673.551	13.904	113.400	64.611	980.392	3690	NY	3690	2014	853666	1960	2006	10000	1	1960	2006
10000	2018	INDL	STANDARD MOTOR PRODS	12	843.132	20.141	105.555	90.754	1092.051	3690	NY	3690	2018	853666	1960	2006	10000	1	1960	2006
10000	2010	INDL	STANDARD MOTOR PRODS	12	492.801	10.806	62.873	60.666	810.910	3690	NY	3690	2010	853666	1960	2006	10000	1	1960	2006
10000	2009	INDL	STANDARD MOTOR PRODS	12	484.459	7.174	46.082	61.478	735.424	3690	NY	3690	2009	853666	1960	2006	10000	1	1960	2006
10000	1997	INDL	STANDARD MOTOR PRODS	12	577.137	15.597	28.435	126.024	559.823	3690	NY	3690	1997	853666	1960	2006	10000	1	1960	2006
10000	2004	INDL	STANDARD MOTOR PRODS	12	656.569	9.774	35.154	97.425	824.283	3690	NY	3690	2004	853666	1960	2006	10000	1	1960	2006
10000	2008	INDL	STANDARD MOTOR PRODS	12	575.027	10.500	32.657	66.901	775.241	3690	NY	3690	2008	853666	1960	2006	10000	1	1960	2006
10000	1991	INDL	STANDARD MOTOR PRODS	12	392.755	12.049	33.407	96.203	534.808	3690	NY	3690	1991	853666	1960	2006	10000	1	1960	2006
10000	1999	INDL	STANDARD MOTOR PRODS	12	556.021	14.423	46.774	106.578	658.241	3690	NY	3690	1999	853666	1960	2006	10000	1	1960	2006

2.2

#merge inventors to patents to firms
inventor <- inventor %>% rename(patnum = patent) %>% filter(country == 'US')
patent2 <- inner_join(inventor, patassg, by = 'patnum')
patent3 <- inner_join(patent2, dynass, by = 'pdpass')
patent3 <- patent3 %>% select(-6,-8,-10,-19:-34) #drop unnecessary columns
patent3 <- patent3 %>% rename(gvkey = pdpco1)
compustat_pat_inv <- inner_join(compustat_patent, patent3, by = 'gvkey')
compustat_pat_inv <- compustat_pat_inv %>% rename(state = state.y)
compustat_pat_inv <- compustat_pat_inv %>% select(-3,-5:-6,-15:-18,-21:-22,-28:-36)
formattable(head(compustat_pat_inv, 10))

gvkey	fyear	conm	capx	ebitda	ppent	sale	sic	state.x	sic4	year	pdpco	pdpseq	patnum	firstname	lastname	state	country
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	3934385	AARON J	UNGERER	NY	US
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	3934385	PETER V	PAULUS	MI	US
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	3944693	AARON J	UNGERER	NY	US
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	3959538	THEODORE	LOEW	NY	US
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	3959537	THEODORE	LOEW	NY	US
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	3958383	JOHN A	DALLEN	OH	US
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	3959941	LEONARD W	SMITH	OH	US
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	3970498	THEODORE	LOEW	NY	US
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	4044177	PETER V	PAULUS	MI	US
10004	1999	STANDARD PRODUCTS CO	67.214	122.344	359.624	1083.785	3714	MI	3714	1999	10004	1	4044177	THEODORE	LOEW	NY	US

2.3

#count the number of inventors for at firm, state, year level
inventor_count <- compustat_pat_inv %>% 
  group_by(conm, year, state, sic) %>%                        
  summarise(count_inv = sum(pdpseq)) #use pdpseq to count because this is a column of all 1s

#count the number of state-total inventors at firm, year level
inventor_count_sum <- inventor_count %>% group_by(conm, year) %>% summarise(sum_count = sum(count_inv))
inventor_share <- inner_join(inventor_count, inventor_count_sum, by = c('conm','year'))

#caculate the share of inventors across states for each firm. Here, I cannot calculate the 10-year rolling window though
inventor_share <- inventor_share %>% mutate(share = round(count_inv/sum_count,2))

#merge back to the dataset
inventor_share <- inner_join(inventor_share, compustat_patent, by = c('year','conm'))
inventor_share <- inventor_share %>% select(1:8,13,15) %>% rename(state = state.x, sic = sic.x) %>% ungroup()
formattable(head(inventor_share, 10))

conm	year	state	sic	count_inv	sum_count	share	gvkey	capx	ppent
1ST PRESTIGE WEALTH MGMT INC	2003	AR	6200	1	13	0.08	65525	0.000	0.009
1ST PRESTIGE WEALTH MGMT INC	2003	MA	6200	3	13	0.23	65525	0.000	0.009
1ST PRESTIGE WEALTH MGMT INC	2003	NV	6200	1	13	0.08	65525	0.000	0.009
1ST PRESTIGE WEALTH MGMT INC	2003	NY	6200	8	13	0.62	65525	0.000	0.009
1ST PRESTIGE WEALTH MGMT INC	2007	AR	6200	1	13	0.08	65525	0.156	0.143
1ST PRESTIGE WEALTH MGMT INC	2007	MA	6200	3	13	0.23	65525	0.156	0.143
1ST PRESTIGE WEALTH MGMT INC	2007	NV	6200	1	13	0.08	65525	0.156	0.143
1ST PRESTIGE WEALTH MGMT INC	2007	NY	6200	8	13	0.62	65525	0.156	0.143
24/7 REAL MEDIA INC	1996	VA	7370	2	2	1.00	113498	1.537	1.679
24/7 REAL MEDIA INC	1997	VA	7370	2	2	1.00	113498	0.042	0.591

2.4

#change state full name to abbreviations to merge
state_name <- tax_credit$statename
state <- state.abb[match(state_name,state.name)]
tax_credit$state <- state
compustat_pat_tax <- inner_join(inventor_share, tax_credit, by = c('state','year'))
formattable(head(compustat_pat_tax, 10))

conm	year	state	sic	count_inv	sum_count	share	gvkey	capx	ppent	statename	rd_taxcredit
1ST PRESTIGE WEALTH MGMT INC	2003	AR	6200	1	13	0.08	65525	0.000	0.009	Arkansas	1.199619
1ST PRESTIGE WEALTH MGMT INC	2003	MA	6200	3	13	0.23	65525	0.000	0.009	Massachusetts	1.136985
1ST PRESTIGE WEALTH MGMT INC	2003	NV	6200	1	13	0.08	65525	0.000	0.009	Nevada	1.155769
1ST PRESTIGE WEALTH MGMT INC	2003	NY	6200	8	13	0.62	65525	0.000	0.009	New York	1.206913
1ST PRESTIGE WEALTH MGMT INC	2007	AR	6200	1	13	0.08	65525	0.156	0.143	Arkansas	1.199619
1ST PRESTIGE WEALTH MGMT INC	2007	MA	6200	3	13	0.23	65525	0.156	0.143	Massachusetts	1.136985
1ST PRESTIGE WEALTH MGMT INC	2007	NV	6200	1	13	0.08	65525	0.156	0.143	Nevada	1.155769
1ST PRESTIGE WEALTH MGMT INC	2007	NY	6200	8	13	0.62	65525	0.156	0.143	New York	1.203977
24/7 REAL MEDIA INC	1996	VA	7370	2	2	1.00	113498	1.537	1.679	Virginia	1.196031
24/7 REAL MEDIA INC	1997	VA	7370	2	2	1.00	113498	0.042	0.591	Virginia	1.196031

2.5

compustat_weighted_tax <- compustat_pat_tax %>% group_by(conm, year) %>% 
  summarize(weighted_tax = sum(share*rd_taxcredit)) %>% ungroup() #calculate the weighted tax rate
#merge back to the dataset
compustat_weighted_tax <- inner_join(compustat_weighted_tax, compustat_pat_tax, by = c('year','conm'))
compustat_weighted_tax <- compustat_weighted_tax %>% select(1:5,10,11)
formattable(head(compustat_weighted_tax,10))

conm	year	weighted_tax	state	sic	capx	ppent
1ST PRESTIGE WEALTH MGMT INC	2003	1.198223	AR	6200	0.000	0.009
1ST PRESTIGE WEALTH MGMT INC	2003	1.198223	MA	6200	0.000	0.009
1ST PRESTIGE WEALTH MGMT INC	2003	1.198223	NV	6200	0.000	0.009
1ST PRESTIGE WEALTH MGMT INC	2003	1.198223	NY	6200	0.000	0.009
1ST PRESTIGE WEALTH MGMT INC	2007	1.196403	AR	6200	0.156	0.143
1ST PRESTIGE WEALTH MGMT INC	2007	1.196403	MA	6200	0.156	0.143
1ST PRESTIGE WEALTH MGMT INC	2007	1.196403	NV	6200	0.156	0.143
1ST PRESTIGE WEALTH MGMT INC	2007	1.196403	NY	6200	0.156	0.143
24/7 REAL MEDIA INC	1996	1.196031	VA	7370	1.537	1.679
24/7 REAL MEDIA INC	1997	1.196031	VA	7370	0.042	0.591

Question 3

3.1

cw <- read_dta('cw_hs6_sic87dd.dta')
trade <- read_dta('import_china_HS_classification.dta')
cw <- cw %>% rename(productcode = commoditycode)
merge_trade_cw <- merge(cw, trade, by = 'productcode')
formattable(head(merge_trade_cw,10))

productcode	sic87dd	share	tradevaluein1000usd	quantity
100190	111	1	23.928	4400
100190	111	1	38.820	40735
100190	111	1	12.348	2520
100190	111	1	323.681	362698
100190	111	1	1.705	1812
100190	111	1	3.059	1600
100190	111	1	5.720	1749
100190	111	1	69.281	199426
100190	111	1	341.591	445091
100300	119	1	29.897	18238

3.2

#remove the last 0 digit if there are 4 digits 
vec <- compustat_weighted_tax$sic #store the column values in a vector
new_sic <- str_remove(vec, "0$") #remove last 0 digit

#longer way: for loop to remove the last 0 digit
#new_sic <- c()
#counter = 1
#for (i in vec){
  #if (str_sub(i, str_length(i), str_length(i)) == '0'){
    #new_sic[counter] <- str_sub(i, 1, str_length(i)-1)
    #counter = counter + 1
  #} else {
    #new_sic[counter] <- i
    #counter = counter + 1
  #}
#}

#create a new column after having removed the 0 digit
compustat_weighted_tax$sic87 <- as.numeric(new_sic)
write_dta(compustat_weighted_tax, 'merged_data_1.dta') #save the data file to use the dofile in Stata
formattable(head(compustat_weighted_tax,10))

conm	year	weighted_tax	state	sic	capx	ppent	sic87
1ST PRESTIGE WEALTH MGMT INC	2003	1.198223	AR	6200	0.000	0.009	620
1ST PRESTIGE WEALTH MGMT INC	2003	1.198223	MA	6200	0.000	0.009	620
1ST PRESTIGE WEALTH MGMT INC	2003	1.198223	NV	6200	0.000	0.009	620
1ST PRESTIGE WEALTH MGMT INC	2003	1.198223	NY	6200	0.000	0.009	620
1ST PRESTIGE WEALTH MGMT INC	2007	1.196403	AR	6200	0.156	0.143	620
1ST PRESTIGE WEALTH MGMT INC	2007	1.196403	MA	6200	0.156	0.143	620
1ST PRESTIGE WEALTH MGMT INC	2007	1.196403	NV	6200	0.156	0.143	620
1ST PRESTIGE WEALTH MGMT INC	2007	1.196403	NY	6200	0.156	0.143	620
24/7 REAL MEDIA INC	1996	1.196031	VA	7370	1.537	1.679	737
24/7 REAL MEDIA INC	1997	1.196031	VA	7370	0.042	0.591	737

#Data after converted by Do-file
data <- read_dta('compustat_sicmod.dta')
#final_data <- inner_join(data, merge_trade_cw, by = 'sic87dd') : takes too long to run
#Here, I use Stata to merge the file instead because it takes a long time to merge in R
import <- read_dta('import_data.dta') #file after m:m merged in Stata
formattable(head(import,10))

conm	year	weighted_tax	state	sic	capx	ppent	sic87	sic87dd	productcode	share	weights_method	tradevaluein1000usd	quantity
HILLSHIRE BRANDS CO	2008	1.192283	MS	2013	454.000	2519.000	2013	2011	410130	1.00000000	0	44.607	3432
HILLSHIRE BRANDS CO	1996	1.203889	AL	2013	542.000	3007.000	2013	2011	410121	1.00000000	0	0.480	195
HILLSHIRE BRANDS CO	2009	1.192191	VA	2013	357.000	2356.000	2013	2011	410121	1.00000000	0	4.598	2026
BRIDGFORD FOODS CORP	1998	1.132321	CA	2013	2.285	16.197	2013	2011	410129	1.00000000	0	159.901	31605
TYSON FOODS INC -CL A	2002	1.189733	FL	2011	506.000	4038.000	2011	2011	160100	1.00000000	0	8.626	990
HILLSHIRE BRANDS CO	2006	1.193954	WI	2013	625.000	2942.000	2013	2011	410110	1.00000000	0	214.301	73027
HILLSHIRE BRANDS CO	2002	1.185731	SC	2013	669.000	3155.000	2013	2011	391710	0.95735645	1	208.218	15014
HORMEL FOODS CORP	1990	1.202108	CT	2011	34.988	235.026	2011	2011	410121	1.00000000	0	12.339	2875
HILLSHIRE BRANDS CO	2011	1.190193	GA	2013	337.000	1648.000	2013	2011	410129	1.00000000	0	93.065	27906
HORMEL FOODS CORP	2008	1.212460	IA	2011	125.890	977.657	2011	2011	510119	0.09350958	1	4.061	133

3.3

import_state_year <- import %>% 
  group_by(state, year) %>%   #sum by state and year                     
  summarise(sum_trade = sum(tradevaluein1000usd)) %>% drop_na()

final_data <- inner_join(compustat_weighted_tax, import_state_year, by = c('year','state'))
formattable(head(final_data,10))

conm	year	weighted_tax	state	sic	capx	ppent	sic87	sum_trade
1ST PRESTIGE WEALTH MGMT INC	2003	1.198223	AR	6200	0.000	0.009	620	25890618
1ST PRESTIGE WEALTH MGMT INC	2007	1.196403	AR	6200	0.156	0.143	620	25285322
3-DIMENSIONAL PHARMACEUTICAL	2001	1.151313	DE	2834	9.090	11.735	2834	35417462
3M CO	1990	1.162646	AL	2670	1337.000	4389.000	267	57134235
3M CO	1990	1.162646	AR	2670	1337.000	4389.000	267	68336776
3M CO	1990	1.162646	DE	2670	1337.000	4389.000	267	26059905
3M CO	1990	1.162646	IA	2670	1337.000	4389.000	267	15579231
3M CO	1990	1.162646	LA	2670	1337.000	4389.000	267	23275225
3M CO	1990	1.162646	MS	2670	1337.000	4389.000	267	5561636
3M CO	1990	1.162646	MT	2670	1337.000	4389.000	267	11325592

Question 4

I cannot do this part, both because some of the above parts may be incorrect and because I am very inexperienced with panel data regression. I have tried the below code but it gives error output for the regression.

#demean the variable
#sample_mean_data <- final_data %>% group_by(conm) %>% summarize(sample_mean = mean(sum_trade))
#data_before_demean <- inner_join(final_data, sample_mean_data, by = 'conm')
#demeaned_data <- data_before_demean %>% mutate(demeaned_trade = sum_trade - sample_mean)

#create the dependent variable
#final_data_reg <- demeaned_data %>% mutate(investment = capx/ppent)

#model <- plm(investment ~ weighted_tax*demeaned_trade + weighted_tax + as.factor(year), data = final_data_reg, index=c('year', 'state'), model="with", effect="twoways")