1. Constructing R&D stock data

The R&D stock data is constructed following the perpetual inventory method. The R&D spending flow series is extracted from the OECD's ANBERD database.
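
Concretely, the law of motion implemented in the code below is (with the depreciation rate fixed at 0.1):

\(rdstock_t = 0.9\times rdstock_{t-1} + rd\_spending_{t-1}\), where \(rdstock_{2000} = \frac{rd\_spending_{2000}}{meangrd + 0.1}\) and \(meangrd\) is the country-industry average yearly growth rate of R&D spending.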

R&D Data

library(dplyr)    # pipes and data manipulation
library(ggplot2)  # plots

rnd <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/anberd.csv") 
rnd <- rnd %>% select(LOCATION, Country, IND, Industry, TIME, Value, Measure, Flag.Codes)
colnames(rnd) <- c("country_iso", "country_name", "industry_code", "industry_name", "year", "rd_spending", "unit", "estimated")
#rnd2 <- rnd %>% expand(country_iso, industry_code, year) # not working; balancing the panel is set aside for now


#Creating R&D stock measures: first I compute yearly growth rates of spending, then I set the starting value at t = 2000 (spending divided by the sum of the average growth rate and the depreciation rate), and compute the stock for each subsequent year as the non-depreciated stock from period t-1 plus period t-1 spending. The depreciation rate is uniformly set to 0.1. 
rnd <- rnd %>%
    filter(unit == "PPP dollars - 2015 prices") %>% 
    group_by(country_iso, industry_code) %>%
    arrange(year) %>%
    mutate(grd = ifelse(year > 2000, (rd_spending - lag(rd_spending))/lag(rd_spending), NA)) %>% 
    mutate(meangrd = mean(grd, na.rm = TRUE)) %>% 
    mutate(rdstock = ifelse(year == 2000, rd_spending / (meangrd + 0.1), 0)) %>% 
    mutate(rdstock = ifelse(year > 2000, 0.9*lag(rdstock) + lag(rd_spending), rdstock))

write.csv(rnd, "/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/RnD.csv")
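
Caveat: inside a single mutate(), lag(rdstock) refers to the column as it stood before that mutate, so the recursion above only carries the 2000 starting value one period forward; from 2002 on, lagged spending is added to the zero placeholder rather than to the accumulated stock. The same caveat applies to the capital stock recursion in section 2. Below is a minimal sketch of a fully recursive alternative using purrr::accumulate; the rdstock_rec column name is illustrative, and it assumes each country-industry series starts in 2000 with no gaps.

library(purrr)  # accumulate()

rnd <- rnd %>%
    group_by(country_iso, industry_code) %>%
    arrange(year, .by_group = TRUE) %>%
    mutate(rdstock_rec = accumulate(
        rd_spending[-length(rd_spending)],            # spending in t-1, for t = 2001, 2002, ...
        ~ 0.9 * .x + .y,                              # stock_t = 0.9*stock_{t-1} + spending_{t-1}
        .init = rd_spending[1] / (meangrd[1] + 0.1)   # starting stock in 2000
    )) %>%
    ungroup()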

#Visualizing stocks for G5 countries 
p1 <- rnd %>% group_by(country_iso, year) %>% summarize(rdstock = sum(rdstock)) %>% ggplot(aes(x = year, y = rdstock)) 

p1 + geom_bar(stat="identity", width = .5, colour="black", fill="white") + facet_grid(. ~ country_iso)

2. Constructing TFP

This exercise follows the approach of Keller (2002) as closely as possible. However, the capital stock adjustment performed in Keller is not replicated, and the labor shares used here are income-based rather than cost-based.

2.1. Initial attempt

stan <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/stan.csv")
stan <- stan %>% select(LOCATION, Country, IND, Industry, Time, VAR, Variable, Value, Unit.Code, PowerCode, Reference.Period.Code)
colnames(stan) <- c("country_iso", "country_name", "industry_code", "industry_name", "year", "variable_code", "variable_name", "value", "currency", "unit", "reference_year")

#Generating key variables
#1. Value added at constant 2010 USD: generate a constant local-price series, divide by its 2010 value to get an index, then multiply the index by the series expressed in USD using 2010 conversion rates. 
library(dplyr)
exr <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/PPPXrate.csv")
exr <- exr %>% 
  select(LOCATION, INDICATOR, TIME, Value) %>% 
  dplyr::rename("country_iso" = "LOCATION", "indicator" = "INDICATOR", "year" = "TIME", "xr" = "Value") %>%
  filter(year == 2010) %>% 
  select(-year)
  
va1 <- stan %>% 
  right_join(exr, by = "country_iso") %>%
  filter(variable_code == "VALU") %>% 
  dplyr::rename(value_added = value) %>%
  select(country_iso, industry_code, year, value_added, xr)

va2 <- stan %>% 
  filter(variable_code == "VALP") %>% #VA deflator 
  dplyr::rename(va_deflator = value) %>% 
  select(country_iso, industry_code, year, va_deflator)


va <- left_join(va1, va2, by = c("country_iso", "industry_code", "year")) %>% 
  mutate(cpva = value_added*100/va_deflator) %>%
  group_by(country_iso, industry_code) %>% 
  arrange(year) %>%
  mutate(index = cpva/cpva[11]) %>% 
  mutate(rva = index*value_added*xr) %>% 
  mutate(log_rva = log(rva)) %>% 
  group_by(country_iso, year) %>%
  mutate(mean_log_rva = mean(log_rva, na.rm = T))

#2. Generate labor input term 

lab1 <- stan %>% 
  filter(variable_code == "EMPN") %>% 
  dplyr::rename(tot_employed = value) %>% 
  select(country_iso, industry_code, year, tot_employed)

lab2 <- stan %>% 
  filter(variable_code == "HRSN" ) %>% 
  dplyr::rename(hrs_worked = value) %>% #Check description in STAN database
  select(country_iso, industry_code, year, hrs_worked)

lab <- left_join(lab1, lab2, by = c("country_iso", "industry_code", "year")) %>% 
  mutate(lab = tot_employed*hrs_worked) %>% 
  mutate(log_labor = log(lab)) %>% 
  group_by(country_iso, year) %>% 
  mutate(mean_log_labor = mean(log_labor, na.rm = T))

#3. Capital: take gross fixed capital formation in current prices, use the gross fixed capital formation deflator to generate a constant local-price series, divide by its 2010 value to get an index, then multiply the index by the series expressed in USD using 2010 conversion rates. This gives the flow. The stock is then created with the perpetual inventory method, using industry-specific depreciation rates taken from BEA estimates. 
cap1 <- stan %>% 
  right_join(exr, by = "country_iso") %>%
  filter(variable_code == "GFCF") %>% 
  dplyr::rename(cap_flow = value) %>% 
  select(country_iso, industry_code, year, cap_flow, xr)

cap2 <- stan %>% 
  filter(variable_code == "GFCP") %>% #capital formation deflator 
  dplyr::rename(cap_deflator = value) %>% 
  select(country_iso, industry_code, year, cap_deflator)


cap <- left_join(cap1, cap2, by = c("country_iso", "industry_code", "year")) %>% 
  mutate(cpcap = cap_flow*100/cap_deflator) %>% 
  group_by(country_iso, industry_code) %>% 
  arrange(year) %>%
  mutate(index = cpcap/cpcap[11]) %>% 
  mutate(rcapflow = index*cap_flow*xr) 

#We now have a real flow measure and need a stock measure. First I create a dataframe of capital depreciation rates by industry, then I use the perpetual inventory method to compute the stock at each year t. 

deprates <- stan %>% 
  distinct(industry_code, industry_name) %>% #rates from https://apps.bea.gov/national/pdf/BEA_depreciation_rates.pdf, rounded to two decimals; where consolidation is needed, the average of the relevant BEA categories is taken (e.g. 50% special industry machinery, 50% computer and peripheral equipment). 
  cbind(c(NA, 0.09, 0.12, 0.15, 0.11, 0.12, 0.11, 0.13, 0.1, 0.08, 0.13, 0.13, 0.07, 0.12, 0.12)) 
colnames(deprates) <- c("industry_code", "industry_name", "dep_rate")
deprates <- deprates %>% select(industry_code, dep_rate)


cap <- cap %>%
    left_join(deprates, by = "industry_code") %>% 
    group_by(country_iso, industry_code) %>%
    arrange(year) %>%
    mutate(gcf = ifelse(year > 2000, (rcapflow - lag(rcapflow))/lag(rcapflow), NA)) %>% 
    mutate(meangcf = mean(gcf, na.rm = TRUE)) %>% 
    mutate(capstock = ifelse(year == 2000, rcapflow / (meangcf + dep_rate), 0)) %>% 
    mutate(capstock = ifelse(year > 2000, (1-dep_rate)*lag(capstock) + lag(rcapflow), capstock)) %>% 
    mutate(log_capstock = log(capstock)) %>% 
    group_by(country_iso, industry_code) %>%
    mutate(mean_log_capstock = mean(log_capstock, na.rm = T))

#4. Adjusting capital : Skipped 

#5. Computing income share of labor, and of capital 
share1 <- stan %>% 
  filter(variable_code == "LABR") %>% 
  dplyr::rename(labor_costs = value) %>% 
  select(country_iso, industry_code, year, labor_costs)

share2 <- stan %>% 
  filter(variable_code == "PROD") %>% 
  dplyr::rename(production = value) %>% 
  select(country_iso, industry_code, year, production)

share <- left_join(share1, share2, by = c("country_iso", "industry_code", "year")) %>% 
  mutate(lab_share = labor_costs/production) %>% 
  mutate(cap_share = 1 - lab_share)


# 6. Computing TFP  
a <- left_join(va, lab, by = c("country_iso", "industry_code", "year"))
b <- left_join(cap, share, by = c("country_iso", "industry_code", "year"))

dependent_var <- left_join(a, b, by = c("country_iso", "industry_code", "year")) %>% 
  mutate(log_fcit = (log_rva - mean_log_rva) - lab_share*(log_labor - mean_log_labor) - cap_share*(log_capstock - mean_log_capstock))
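
For reference, the index computed in the last mutate is:

\(log\_fcit = (log\_rva - \overline{log\_rva}) - lab\_share\times(log\_labor - \overline{log\_labor}) - cap\_share\times(log\_capstock - \overline{log\_capstock})\)

where the bars denote the group means computed above: country-year means across industries for value added and labor, and country-industry means over time for the capital stock.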

#7. Plotting constructed outcome variable 
ggplot(dependent_var, aes(x=log_fcit)) + 
    geom_histogram(aes(y=..density..),      # Histogram with density instead of count on y-axis
                   binwidth=.5,
                   colour="black", fill="white") +
    geom_density(alpha=.2, fill="#FF6666") + # Overlay with transparent density plot
    geom_vline(aes(xintercept=mean(log_fcit, na.rm=T)),   # Ignore NA values for mean
               color="red", linetype="dashed", size=1) + ggtitle("Distribution of log TFP (unadjusted)") +
  xlab("log(TFP)") + ylab("Density")

write.csv(dependent_var, "/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/Dpendent_var_nas.csv")

2.2. Troubleshooting missing values

The constructed TFP measure has a total of 1243 missing values out of 3410 observations, so over a third of the values are missing. Below I break down the sources of missing values.

\(rva_t = \frac{value\_added_t\times 100}{va\_deflator_t}\times \frac{va\_deflator_{2015}}{value\_added_{2015}\times 100}\times value\_added\times xr\)

The value added deflator is missing in 234 instances, which generates 242 missing values for the real value added measure.

\(capstock_t = (1-deprate)\times capstock_{t-1} + rcapflow_{t-1}\), where \(capstock_{t=0} = \frac{rcapflow_{t=0}}{meangcf + deprate}\) and \(rcapflow_t = index_t\times cap\_flow_t\times xr\), with \(index_t\) the constant-price gross fixed capital formation index constructed above.

The capital stock measure is missing in 860 instances. The capital formation deflator is missing in 405 instances, 40 of which are for 2015; because the series is normalized by its 2015 value, these missing deflators ripple through the time-series dimension of the panel.

The third source of missing values is the labor input measure. This measure is defined as follows:

\(labor = total\_employed \times hours\_worked\)

There are 753 missing values, all of which are due to unreported hours worked.

One solution for deflators is to keep the current series and replace missing values by an approximation. Deflators depend on global and local variations in input prices, local price inflation, and monetary aggregates. To the extent that the main drivers of price dynamics are experienced more uniformly within a country than within an industry, we can use the average deflator across available industries in a given country (this requires that at least one deflator is reported per country per year).

The number of hours worked is also likely to depend on geographic factors more than industrial factors: laws regulating work hours, holidays, time off, etc. are locally determined. Therefore, a possible approximation for missing hours worked is the cross-industry country average for the given year.

Finally, the income shares could not be computed for all industries, countries, and years because the production series is missing in some 40 instances. Again, I propose to replace the labor shares for industries where they are missing by the local (country-year) average, because local dynamics and regulations likely matter more than sector characteristics.

Below I implement these revisions.

adjusted_va <- left_join(va1, va2, by = c("country_iso", "industry_code", "year")) %>% 
  group_by(country_iso, year) %>% 
  mutate(average_deflator = mean(va_deflator, na.rm = T)) %>%
  ungroup() %>%
  mutate(adj_va_deflator = ifelse(is.na(va_deflator), average_deflator , va_deflator)) %>% 
  mutate(cpva = value_added*100/adj_va_deflator) %>% 
  group_by(country_iso, industry_code) %>% 
  arrange(year) %>%
  mutate(index = cpva/cpva[11]) %>% 
  mutate(rva = index*value_added*xr) %>% 
  mutate(log_rva = log(rva)) %>% 
  group_by(country_iso, year) %>%
  mutate(mean_log_rva = mean(log_rva, na.rm = T))
#Some missing values remain, which is puzzling, but their number is reduced by 84%. 


adjusted_cap <- cap %>% 
  group_by(country_iso, year) %>% 
  mutate(average_cap_deflator = mean(cap_deflator, na.rm = T)) %>%
  ungroup() %>%
  mutate(adj_cap_deflator = ifelse(is.na(cap_deflator), average_cap_deflator , cap_deflator)) %>% 
  mutate(cpcap = cap_flow*100/adj_cap_deflator) %>% 
  group_by(country_iso, industry_code) %>% 
  arrange(year) %>%
  mutate(index = cpcap/cpcap[11]) 
  
  
adjusted_cap <- adjusted_cap %>% 
  ungroup() %>%
  mutate(rcapflow = index*cap_flow*xr) %>%
    group_by(country_iso, industry_code) %>%
    arrange(year) %>%
    mutate(gcf = ifelse(year > 2000, (rcapflow - lag(rcapflow))/lag(rcapflow), NA)) %>% 
    mutate(meangcf = mean(gcf, na.rm = TRUE)) %>% 
    mutate(capstock = ifelse(year == 2000, rcapflow / (meangcf + dep_rate), 0)) %>% 
    mutate(capstock = ifelse(year > 2000, (1-dep_rate)*lag(capstock) + lag(rcapflow), capstock)) %>% 
    mutate(log_capstock = log(capstock)) %>% 
    group_by(country_iso, industry_code) %>%
    mutate(mean_log_capstock = mean(log_capstock, na.rm = T))
#28 flows are missing but 265 stocks are missing, because gaps compound through the recursion; the stock formula may need to account for missing values. 
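
As suggested in the comment, one option is to fill interior gaps in the real flow before cumulating, so that a single missing year does not propagate through the rest of the stock series. A sketch using linear interpolation with zoo::na.approx follows; the rcapflow_filled column name is illustrative, and whether interpolation is acceptable for these flows is an assumption.

library(zoo)  # na.approx()

adjusted_cap <- adjusted_cap %>%
  group_by(country_iso, industry_code) %>%
  arrange(year, .by_group = TRUE) %>%
  # interpolate interior NAs only; series with fewer than two observations are left untouched
  mutate(rcapflow_filled = if (sum(!is.na(rcapflow)) >= 2)
           zoo::na.approx(rcapflow, x = year, na.rm = FALSE) else rcapflow) %>%
  ungroup()
#The stock recursion would then be rerun on rcapflow_filled instead of rcapflow.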

adjusted_lab <- lab %>% 
  group_by(country_iso, year) %>% 
  mutate(average_hrs_worked = mean(hrs_worked, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(adj_hrs_worked = ifelse(is.na(hrs_worked), average_hrs_worked, hrs_worked)) %>%
  mutate(lab = tot_employed*adj_hrs_worked) %>% 
  mutate(log_labor = log(lab)) %>% 
  group_by(country_iso, year) %>% 
  mutate(mean_log_labor = mean(log_labor, na.rm = T))

#Japan and Australia do not report hours worked. For Japan this is not a problem because we do not need Japan's TFP, but for Australia it is; I could impute averages across countries. 
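
A sketch of that cross-country imputation: take the mean hours worked by industry and year across reporting countries, and fall back on it where a country reports no hours at all. The avg_hrs_xctry column name is illustrative.

adjusted_lab <- adjusted_lab %>% 
  group_by(industry_code, year) %>% 
  mutate(avg_hrs_xctry = mean(hrs_worked, na.rm = TRUE)) %>%   # cross-country mean by industry-year
  ungroup() %>% 
  mutate(adj_hrs_worked = ifelse(is.na(adj_hrs_worked), avg_hrs_xctry, adj_hrs_worked),
         lab = tot_employed*adj_hrs_worked,
         log_labor = log(lab)) %>% 
  group_by(country_iso, year) %>% 
  mutate(mean_log_labor = mean(log_labor, na.rm = TRUE)) %>% 
  ungroup()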


adjusted_share <- share %>% 
  group_by(country_iso, year) %>% 
  mutate(average_lab_share = mean(lab_share, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(lab_share = ifelse(is.na(lab_share), average_lab_share, lab_share)) %>% 
  mutate(cap_share = 1-lab_share) 

#Missing labor shares reduced from 45 to 38; not fully resolved. 

c <- left_join(adjusted_va, adjusted_lab, by = c("country_iso", "industry_code", "year"))
d <- left_join(adjusted_cap, adjusted_share, by = c("country_iso", "industry_code", "year"))

naadj_dependent_var <- left_join(c, d, by = c("country_iso", "industry_code", "year")) %>% 
  mutate(log_fcit = (log_rva - mean_log_rva) - lab_share*(log_labor - mean_log_labor) - cap_share*(log_capstock - mean_log_capstock))

#NAs down to 705 from 1243. The remaining missing values are limited to specific countries with reporting issues. 

write.csv(naadj_dependent_var, "/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/Dpendent_var_nas.csv")

#Distribution of adjusted variable 
ggplot(naadj_dependent_var, aes(x=log_fcit)) + 
    geom_histogram(aes(y=..density..),      # Histogram with density instead of count on y-axis
                   binwidth=.5,
                   colour="black", fill="white") +
    geom_density(alpha=.2, fill="#FF6666") + # Overlay with transparent density plot
    geom_vline(aes(xintercept=mean(log_fcit, na.rm=T)),   # Ignore NA values for mean
               color="red", linetype="dashed", size=1) + ggtitle("Distribution of log TFP (after NA adjustment)") +
  xlab("log(TFP)") + ylab("Density")

The adjustments allow us to recover approximations for 538 of the missing values. We are, however, still left with 705 missing observations.

2.3. Testing validity of TFP measure

I run two tests to sanity-check the constructed TFP measure, comparing it along two dimensions: country-level TFP and industry-level TFP.

First, I check the correlation between the constructed TFP, averaged across the 14 industries within each country, and the OECD's cross-country TFP measure.

Second, I check the correlation between the constructed industry-level TFP for the US and observed US TFP as estimated by the BLS.

#Country level 
constructed <- dependent_var %>% 
  filter(industry_code != "D10T33") %>% 
  select(country_iso, year, log_fcit) %>% 
  filter(!is.na(log_fcit)) %>% 
  group_by(country_iso, year) %>% 
  summarize(log_fcit = mean(log_fcit, na.rm = T)) 

country_level <- read.csv("/Users/yab/Downloads/DP_LIVE_15062020043109086.csv")
country_level <- country_level %>% 
  select(LOCATION, TIME, Value) %>% 
  dplyr::rename(country_iso = LOCATION, year = TIME, tfp_b2010 = Value)

country_level_corr <- left_join(constructed, country_level, by = c("country_iso", "year")) %>% 
  group_by(country_iso) %>% 
  summarize(cor(log_fcit, tfp_b2010))

constructed <- naadj_dependent_var %>% 
  filter(industry_code != "D10T33") %>% 
  select(country_iso, year, log_fcit) %>% 
  filter(!is.na(log_fcit)) %>% 
  group_by(country_iso, year) %>% 
  summarize(log_fcit = mean(log_fcit, na.rm = T)) 

country_level_corr_adj <- left_join(constructed, country_level, by = c("country_iso", "year")) %>% 
  group_by(country_iso) %>% 
  summarize(cor(log_fcit, tfp_b2010))

library("kableExtra")

kable(country_level_corr, caption = "Country-level correlations: Constructed TFP vs. OECD Measure") %>% kable_styling(bootstrap_options = c("striped", "hover"))

Country-level correlations: Constructed TFP vs. OECD Measure

country_iso   cor(log_fcit, tfp_b2010)
CAN                  0.5282437
DEU                  0.5115391
DNK                  0.3193141
FIN                  0.6656947
FRA                  0.6935171
GBR                  0.8457201
ITA                 -0.6648197
NLD                  0.6121941
NOR                         NA
SWE                  0.7535911
USA                  0.5475565

kable(country_level_corr_adj, caption = "Country-level correlations: Constructed TFP vs. OECD Measure (NA Adjusted)") %>% kable_styling(bootstrap_options = c("striped", "hover"))

Country-level correlations: Constructed TFP vs. OECD Measure (NA Adjusted)

country_iso   cor(log_fcit, tfp_b2010)
CAN                  0.5282437
DEU                  0.4753621
DNK                  0.3193141
ESP                 -0.7656644
FIN                  0.6656947
FRA                  0.6979906
GBR                  0.8369423
ITA                 -0.6423458
NLD                  0.6121941
NOR                         NA
SWE                  0.7335012
USA                  0.5475565

#Industry level 

industry_level <- read.csv("/Users/yab/Downloads/three_digit_industry_mfp_data.csv") #source : https://www.bls.gov/lpc/tables_by_sector_and_industry.htm
industry_level <- industry_level %>% select(Three.Digit.Manufacturing.Industries, Year, Multifactor.Productivity..2012...100.) %>% filter(Year > 1999 & Year < 2017)
colnames(industry_level) <- c("industry", "year", "tfp_b2012") #industry codes are assigned in the next block

industries <- industry_level %>% distinct(industry) 
codes <- c("D10T12", "D13T15", "D13T15", "D16", "D17", "D18", "REM", "D20T21", "D22", "D23", "D24T25", "D24T25", "D28", "D26", "D27", "D29T30", "D31T32", "D31T32")
industries <- cbind(industries, codes)
colnames(industries) <- c("industry", "industry_code")
industry_level <- left_join(industry_level, industries) %>% group_by(industry_code, year) %>% summarize(tfp_b2012 = mean(tfp_b2012))

constructed <- dependent_var %>% filter(country_iso == "USA") %>% select(year, industry_code, log_fcit)
industry_level_corr <- left_join(constructed, industry_level, by = c("industry_code", "year")) %>% group_by(industry_code) %>% summarize(cor(log_fcit, tfp_b2012))

kable(industry_level_corr, caption = "Industry-level correlations: Constructed TFP vs. BLS Measure") %>% kable_styling(bootstrap_options = c("striped", "hover"))

Industry-level correlations: Constructed TFP vs. BLS Measure

industry_code   cor(log_fcit, tfp_b2012)
D10T12                 0.5190146
D10T33                        NA
D13T15                -0.2008006
D16                    0.0029207
D17                    0.3509064
D18                    0.6310677
D20T21                 0.3777503
D22                    0.6723993
D23                    0.4335162
D24T25                 0.6566184
D26                    0.8548484
D27                    0.7303947
D28                    0.6521289
D29T30                 0.7149755
D31T32                 0.6050431

constructed <- naadj_dependent_var %>% filter(country_iso == "USA") %>% select(year, industry_code, log_fcit) 

industry_level_corr_adj <- left_join(constructed, industry_level, by = c("industry_code", "year")) %>% group_by(industry_code) %>% summarize(cor(log_fcit, tfp_b2012))

kable(industry_level_corr_adj, caption = "Industry-level correlations: Constructed TFP vs. BLS Measure (NA Adjusted)") %>% kable_styling(bootstrap_options = c("striped", "hover"))

Industry-level correlations: Constructed TFP vs. BLS Measure (NA Adjusted)

industry_code   cor(log_fcit, tfp_b2012)
D10T12                 0.5190146
D10T33                        NA
D13T15                -0.2008006
D16                    0.0029207
D17                    0.3509064
D18                    0.6310677
D20T21                 0.3777503
D22                    0.6723993
D23                    0.4335162
D24T25                 0.6566184
D26                    0.8548484
D27                    0.7303947
D28                    0.6521289
D29T30                 0.7149755
D31T32                 0.6050431

We notice that the unadjusted measure correlates somewhat better with the external TFP measures, but the difference is minimal. Since the adjustments increase our sample size by over 500 observations, we prefer the adjusted TFP measure.

3. Building dataset for regression

rm(list=ls())
explanatory_var <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/SPED.csv")
dependent_var <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/Dpendent_var_nas.csv")
rnd <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/RnD.csv")

explanatory_var <- explanatory_var %>% 
  filter(from %in% c("USA", "FRA", "JPN", "GBR", "DEU")) %>% 
  filter(to %in% c("CAN", "ITA", "AUS", "NOR", "FIN", "DNK", "ESP", "NLD", "SWE"))

rnd <- rnd %>% filter(country_iso %in% c("USA", "FRA", "JPN", "GBR", "DEU"))

frame1 <- left_join(explanatory_var, rnd, by = c("from" = "country_iso", "year" = "year")) #expected 585x14 = 8190, got 8181 instead; a few rows are missing, which is fine. 

frame2 <- left_join(frame1, dependent_var, by = c("to" = "country_iso", "year" = "year")) #expected 8181x15 = 122,715 got 120,954, close enough. 

write.csv(frame2, "/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/regression_data.csv")