The R&D stock data is constructed following the perpetual inventory method. The R&D spending flows series are extracted from OECD’s ANBERD database.
rnd <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/anberd.csv")
rnd <- rnd %>% select(LOCATION, Country, IND, Industry, TIME, Value, Measure, Flag.Codes)
colnames(rnd) <- c("country_iso", "country_name", "industry_code", "industry_name", "year", "rd_spending", "unit", "estimated")
#rnd2 <- rnd %>% expand(country_iso, industry_code, year), not working, not worrying about this rn
#Creating R&D stock measures : first I compute yearly growth rates for spending, then I set the starting value at t = 2000, and compute the stock for each year as the non-depreciated stock from period t-1 + t-1 spending. The depreciation rate is uniformily set to 0.1.
rnd <- rnd %>%
filter(unit == "PPP dollars - 2015 prices") %>%
group_by(country_iso, industry_code) %>%
arrange(year) %>%
mutate(grd = ifelse(year > 2000, (rd_spending - lag(rd_spending))/lag(rd_spending), NA)) %>%
mutate(meangrd = mean(grd, na.rm = TRUE)) %>%
mutate(rdstock = ifelse(year == 2000, rd_spending / (meangrd + 0.1), 0)) %>%
mutate(rdstock = ifelse(year > 2000, 0.9*lag(rdstock) + lag(rd_spending), rdstock))
write.csv(rnd, "/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/RnD.csv")
#Visualizing stocks for G5 countries
p1 <- rnd %>% group_by(country_iso, year) %>% summarize(rdstock = sum(rdstock)) %>% ggplot(aes(x = year, y = rdstock))
p1 + geom_bar(stat="identity", width = .5, colour="black", fill="white") + facet_grid(. ~ country_iso)
This exercise follows as much as possible the approach from Keller 2002. However, the capital stock adjustment performed in Keller is not replicated, and the labor shares used here are income rather than cost based.
stan <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/stan.csv")
stan <- stan %>% select(LOCATION, Country, IND, Industry, Time, VAR, Variable, Value, Unit.Code, PowerCode, Reference.Period.Code)
colnames(stan) <- c("country_iso", "country_name", "industry_code", "industry_name", "year", "variable_code", "variable_name", "value", "currency", "unit", "reference_year")
#Generating key variables
#1. Value added at constant 2010 USD : Generate constant local prices series, then divide by 2010 value to get index. Multiply index by series expressed in USD using 2010 conversion rates.
library(dplyr)
exr <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/PPPXrate.csv")
exr <- exr %>%
select(LOCATION, INDICATOR, TIME, Value) %>%
dplyr::rename("country_iso" = "LOCATION", "indicator" = "INDICATOR", "year" = "TIME", "xr" = "Value") %>%
filter(year == 2010) %>%
select(-year)
va1 <- stan %>%
right_join(exr, by = "country_iso") %>%
filter(variable_code == "VALU") %>%
dplyr::rename(value_added = value) %>%
select(country_iso, industry_code, year, value_added, xr)
va2 <- stan %>%
filter(variable_code == "VALP") %>% #VA deflator
dplyr::rename(va_deflator = value) %>%
select(country_iso, industry_code, year, va_deflator)
va <- left_join(va1, va2, by = c("country_iso", "industry_code", "year")) %>%
mutate(cpva = value_added*100/va_deflator) %>%
group_by(country_iso, industry_code) %>%
arrange(year) %>%
mutate(index = cpva/cpva[11]) %>%
mutate(rva = index*value_added*xr) %>%
mutate(log_rva = log(rva)) %>%
group_by(country_iso, year) %>%
mutate(mean_log_rva = mean(log_rva, na.rm = T))
#2. Generate labor input term
lab1 <- stan %>%
filter(variable_code == "EMPN") %>%
dplyr::rename(tot_employed = value) %>%
select(country_iso, industry_code, year, tot_employed)
lab2 <- stan %>%
filter(variable_code == "HRSN" ) %>%
dplyr::rename(hrs_worked = value) %>% #Check description in STAN database
select(country_iso, industry_code, year, hrs_worked)
lab <- left_join(lab1, lab2, by = c("country_iso", "industry_code", "year")) %>%
mutate(lab = tot_employed*hrs_worked) %>%
mutate(log_labor = log(lab)) %>%
group_by(country_iso, year) %>%
mutate(mean_log_labor = mean(log_labor, na.rm = T))
#3. Capital : take gross fixed capital in current prices, use gross fixed capital formation deflator to generate constant local price series, divide by 2010 to get index. Multiply index by series expressed in USD using 2010 conversion rates. This gives the flow. Then I create the stock using inventory method. Industry specific capital depreciation estimates taken from the BEA estimates.
cap1 <- stan %>%
right_join(exr, by = "country_iso") %>%
filter(variable_code == "GFCF") %>%
dplyr::rename(cap_flow = value) %>%
select(country_iso, industry_code, year, cap_flow, xr)
cap2 <- stan %>%
filter(variable_code == "GFCP") %>% #capital formation deflator
dplyr::rename(cap_deflator = value) %>%
select(country_iso, industry_code, year, cap_deflator)
cap <- left_join(cap1, cap2, by = c("country_iso", "industry_code", "year")) %>%
mutate(cpcap = cap_flow*100/cap_deflator) %>%
group_by(country_iso, industry_code) %>%
arrange(year) %>%
mutate(index = cpcap/cpcap[11]) %>%
mutate(rcapflow = index*cap_flow*xr)
#Now have real flow measure, now need to create stock measure. first I create a dataframe of capital depreciation rate by industry. Then I use the perpetual inventory method to compute the stock at every time t
deprates <- stan %>%
distinct(industry_code, industry_name) %>% #from here https://apps.bea.gov/national/pdf/BEA_depreciation_rates.pdf, two decimals, where need consolidation takes average. 50% special industry machinery, 50% computer and peripheral equipment. rounded.
cbind(c(NA, 0.09, 0.12, 0.15, 0.11, 0.12, 0.11, 0.13, 0.1, 0.08, 0.13, 0.13, 0.07, 0.12, 0.12))
colnames(deprates) <- c("industry_code", "industry_name", "dep_rate")
deprates <- deprates %>% select(industry_code, dep_rate)
cap <- cap %>%
left_join(deprates, by = "industry_code") %>%
group_by(country_iso, industry_code) %>%
arrange(year) %>%
mutate(gcf = ifelse(year > 2000, (rcapflow - lag(rcapflow))/lag(rcapflow), NA)) %>%
mutate(meangcf = mean(gcf, na.rm = TRUE)) %>%
mutate(capstock = ifelse(year == 2000, rcapflow / (meangcf + dep_rate), 0)) %>%
mutate(capstock = ifelse(year > 2000, (1-dep_rate)*lag(capstock) + lag(rcapflow), capstock)) %>%
mutate(log_capstock = log(capstock)) %>%
group_by(country_iso, industry_code) %>%
mutate(mean_log_capstock = mean(log_capstock, na.rm = T))
#4. Adjusting capital : Skipped
#5. Computing income share of labor, and of capital
share1 <- stan %>%
filter(variable_code == "LABR") %>%
dplyr::rename(labor_costs = value) %>%
select(country_iso, industry_code, year, labor_costs)
share2 <- stan %>%
filter(variable_code == "PROD") %>%
dplyr::rename(production = value) %>%
select(country_iso, industry_code, year, production)
share <- left_join(share1, share2, by = c("country_iso", "industry_code", "year")) %>%
mutate(lab_share = labor_costs/production) %>%
mutate(cap_share = 1 - lab_share)
# 6. Computing TFP
a <- left_join(va, lab, by = c("country_iso", "industry_code", "year"))
b <- left_join(cap, share, by = c("country_iso", "industry_code", "year"))
dependent_var <- left_join(a, b, by = c("country_iso", "industry_code", "year")) %>%
mutate(log_fcit = (log_rva - mean_log_rva) - lab_share*(log_labor - mean_log_labor) - cap_share*(log_capstock - mean_log_capstock))
#7. Plotting constructed outcome variable
ggplot(dependent_var, aes(x=log_fcit)) +
geom_histogram(aes(y=..density..), # Histogram with density instead of count on y-axis
binwidth=.5,
colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") + # Overlay with transparent density plot
geom_vline(aes(xintercept=mean(log_fcit, na.rm=T)), # Ignore NA values for mean
color="red", linetype="dashed", size=1) + ggtitle("Distribution of log TFP (unadjusted)") +
xlab("log(TFP)") + ylab("Density")
write.csv(dependent_var, "/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/Dpendent_var_nas.csv")
When it comes to the constructed TFP measure, I have a total of 1243 missing values out of 3410 observations. Over a third of values are missing. Below I break down the sources of missing values.
\(rva_t = \frac{value\_added_t*100}{va\_deflator_t}\times \frac{va\_deflator_{2015}}{value\_added_{2015}*100}*value\_added*xr\)
The value added deflator is missing in 234 instances, which generate 242 missing values for my real value added measure.
\(capstock_t = (1-deprate)\times capstock_{t-1} + rcapflow_{t-1}\) Where, \(capstock_{t=0} = \frac{rcapflow_{t=0}}{meangcf + deprate}\) And $rcapflow = cap_flow_t xr $
The capital stock measure is missing in deflator is missing in 860 instances. This is 405 instances. It’s missing in 40 lines for 2015, with rippling effects (through the time series dimension of the panel) due to series being normalized by its 2015 value.
The third source of missing values is the labor input measure. This measure is defined as follows:
\(labor = total\_employed \times hours\_worked\)
There are 753 missing values, all of which are due to unreported hours worked.
One solution for delfators can be to keep the current series and replace missing value by an approximation. Deflators depend on : global and local variations in input prices, and local price inflation and monetary aggregates. To the extent that the main drivers of price dynamics are experienced more uniformily within a country than within an industry, we can propose the following solution. We can use the average deflator across available industries in a given country (this requires that at least one deflator is reported by country by year).
The number of hours worked is also likely to depend on geographic factors more than industrial factors. Laws regulating work hours, holidays, time-off, etc. are locally determined. Therefore, a possible approximation for missed hours worked could be the cross-industry country average for the given year.
Finally, the income shares could not be computed for all industries in all countries in all years because production series is missing in some 40 instances. Again, I propose to replace the labor shares for industries where its missing by using the local average because I believe that local dynamics and regulations matter more than sector characteristics.
Below I implement these revisions.
adjusted_va <- left_join(va1, va2, by = c("country_iso", "industry_code", "year")) %>%
group_by(country_iso, year) %>%
mutate(average_deflator = mean(va_deflator, na.rm = T)) %>%
ungroup() %>%
mutate(adj_va_deflator = ifelse(is.na(va_deflator), average_deflator , va_deflator)) %>%
mutate(cpva = value_added*100/adj_va_deflator) %>%
group_by(country_iso, industry_code) %>%
arrange(year) %>%
mutate(index = cpva/cpva[11]) %>%
mutate(rva = index*value_added*xr) %>%
mutate(log_rva = log(rva)) %>%
group_by(country_iso, year) %>%
mutate(mean_log_rva = mean(log_rva, na.rm = T))
#Still missing values which dont make sense, reduced by 84% tho.
adjusted_cap <- cap %>%
group_by(country_iso, year) %>%
mutate(average_cap_deflator = mean(cap_deflator, na.rm = T)) %>%
ungroup() %>%
mutate(adj_cap_deflator = ifelse(is.na(cap_deflator), average_cap_deflator , cap_deflator)) %>%
mutate(cpcap = cap_flow*100/adj_cap_deflator) %>%
group_by(country_iso, industry_code) %>%
arrange(year) %>%
mutate(index = cpcap/cpcap[11])
adjusted_cap <- adjusted_cap %>%
ungroup() %>%
mutate(rcapflow = index*cap_flow*xr) %>%
group_by(country_iso, industry_code) %>%
arrange(year) %>%
mutate(gcf = ifelse(year > 2000, (rcapflow - lag(rcapflow))/lag(rcapflow), NA)) %>%
mutate(meangcf = mean(gcf, na.rm = TRUE)) %>%
mutate(capstock = ifelse(year == 2000, rcapflow / (meangcf + dep_rate), 0)) %>%
mutate(capstock = ifelse(year > 2000, (1-dep_rate)*lag(capstock) + lag(rcapflow), capstock)) %>%
mutate(log_capstock = log(capstock)) %>%
group_by(country_iso, industry_code) %>%
mutate(mean_log_capstock = mean(log_capstock, na.rm = T))
#28 flows missing but 265 stocks missing because compounded. so maybe change stock formula to account for missing values.
adjusted_lab <- lab %>%
group_by(country_iso, year) %>%
mutate(average_hrs_worked = mean(hrs_worked, na.rm = T)) %>%
ungroup() %>%
mutate(adj_hrs_worked = ifelse(is.na(hrs_worked), average_hrs_worked, hrs_worked)) %>%
mutate(lab = tot_employed*adj_hrs_worked) %>%
mutate(log_labor = log(lab)) %>%
group_by(country_iso, year) %>%
mutate(mean_log_labor = mean(log_labor, na.rm = T))
#Japan and australia do not report hours worked,. for japan this is not a problem because we do not need japan's TFP, but for australia it is... i could amputate averages across countries.
adjusted_share <- share %>%
group_by(country_iso, year) %>%
mutate(average_lab_share = mean(lab_share, na.rm = T)) %>%
ungroup() %>%
mutate(lab_share = ifelse(is.na(lab_share), average_lab_share, lab_share)) %>%
mutate(cap_share = 1-lab_share)
#Reduced from 45 to 38, not resolved
c <- left_join(adjusted_va, adjusted_lab, by = c("country_iso", "industry_code", "year"))
d <- left_join(adjusted_cap, adjusted_share, by = c("country_iso", "industry_code", "year"))
naadj_dependent_var <- left_join(c, d, by = c("country_iso", "industry_code", "year")) %>%
mutate(log_fcit = (log_rva - mean_log_rva) - lab_share*(log_labor - mean_log_labor) - cap_share*(log_capstock - mean_log_capstock))
#NAs down to 1243 from 705. Left over limited to specific countries with reporting issues.
write.csv(naadj_dependent_var, "/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/Dpendent_var_nas.csv")
#Distribution of adjusted variable
ggplot(naadj_dependent_var, aes(x=log_fcit)) +
geom_histogram(aes(y=..density..), # Histogram with density instead of count on y-axis
binwidth=.5,
colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") + # Overlay with transparent density plot
geom_vline(aes(xintercept=mean(log_fcit, na.rm=T)), # Ignore NA values for mean
color="red", linetype="dashed", size=1) + ggtitle("Distribution of log TFP (after NA adjustment)") +
xlab("log(TFP)") + ylab("Density")
The adjustments allow us to recover approximations for 538 of the missing values. We are however still looking at 705 missing.
I run two tests to sanity-check the constructed TFP measure. Two dimensions can be used to compare: the national level TFP and the industry level TFP.
First I check the correlation between national average TFP measures of the 14 industries against cross-country TFP from the OECD.
Second, I check the correlation between industry levels TFP and observed TFP in the US as estimated by BLS.
#Country level
constructed <- dependent_var %>% filter(industry_code != "D10T33") %>% select(country_iso, year, log_fcit) %>% filter(log_fcit != "NaN") %>% group_by(country_iso, year) %>% summarize(log_fcit = mean(log_fcit, na.rm = T))
country_level <- read.csv("/Users/yab/Downloads/DP_LIVE_15062020043109086.csv")
country_level <- country_level %>% select(LOCATION, TIME, Value) %>% dplyr::rename(country_iso = LOCATION, year = TIME, tfp_b2010 = Value)
country_level_corr <- left_join(constructed, country_level, by = c("country_iso", "year")) %>% group_by(country_iso) %>% summarize(cor(log_fcit, tfp_b2010))
constructed <- naadj_dependent_var %>% filter(industry_code != "D10T33") %>% select(country_iso, year, log_fcit) %>% filter(log_fcit != "NaN") %>% group_by(country_iso, year) %>% summarize(log_fcit = mean(log_fcit, na.rm = T))
country_level <- read.csv("/Users/yab/Downloads/DP_LIVE_15062020043109086.csv")
country_level <- country_level %>% select(LOCATION, TIME, Value) %>% dplyr::rename(country_iso = LOCATION, year = TIME, tfp_b2010 = Value)
country_level_corr_adj <- left_join(constructed, country_level, by = c("country_iso", "year")) %>% group_by(country_iso) %>% summarize(cor(log_fcit, tfp_b2010))
library("kableExtra")
kable(country_level_corr, caption = "Country-level correlations: Constructed TFP vs. OECD Measure") %>% kable_styling(bootstrap_options = c("striped", "hover"))
| country_iso | cor(log_fcit, tfp_b2010) |
|---|---|
| CAN | 0.5282437 |
| DEU | 0.5115391 |
| DNK | 0.3193141 |
| FIN | 0.6656947 |
| FRA | 0.6935171 |
| GBR | 0.8457201 |
| ITA | -0.6648197 |
| NLD | 0.6121941 |
| NOR | NA |
| SWE | 0.7535911 |
| USA | 0.5475565 |
kable(country_level_corr_adj, caption = "Country-level correlations: Constructed TFP vs. OECD Measure (NA Adjusted)") %>% kable_styling(bootstrap_options = c("striped", "hover"))
| country_iso | cor(log_fcit, tfp_b2010) |
|---|---|
| CAN | 0.5282437 |
| DEU | 0.4753621 |
| DNK | 0.3193141 |
| ESP | -0.7656644 |
| FIN | 0.6656947 |
| FRA | 0.6979906 |
| GBR | 0.8369423 |
| ITA | -0.6423458 |
| NLD | 0.6121941 |
| NOR | NA |
| SWE | 0.7335012 |
| USA | 0.5475565 |
#Industry level
industry_level <- read.csv("/Users/yab/Downloads/three_digit_industry_mfp_data.csv") #source : https://www.bls.gov/lpc/tables_by_sector_and_industry.htm
industry_level <- industry_level %>% select(Three.Digit.Manufacturing.Industries, Year, Multifactor.Productivity..2012...100.) %>% filter(Year > 1999 & Year < 2017)
colnames(industry_level) <- c("industry", "year", "tfp_b2012") #in the next paragraph i code the industries
industries <- industry_level %>% distinct(industry)
codes <- c("D10T12", "D13T15", "D13T15", "D16", "D17", "D18", "REM", "D20T21", "D22", "D23", "D24T25", "D24T25", "D28", "D26", "D27", "D29T30", "D31T32", "D31T32")
industries <- cbind(industries, codes)
colnames(industries) <- c("industry", "industry_code")
industry_level <- left_join(industry_level, industries) %>% group_by(industry_code, year) %>% summarize(tfp_b2012 = mean(tfp_b2012))
constructed <- dependent_var %>% filter(country_iso == "USA") %>% select(year, industry_code, log_fcit)
industry_level_corr <- left_join(constructed, industry_level, by = c("industry_code", "year")) %>% group_by(industry_code) %>% summarize(cor(log_fcit, tfp_b2012))
kable(industry_level_corr, caption = "Industry-level correlations: Constructed TFP vs. BLS Measure") %>% kable_styling(bootstrap_options = c("striped", "hover"))
| industry_code | cor(log_fcit, tfp_b2012) |
|---|---|
| D10T12 | 0.5190146 |
| D10T33 | NA |
| D13T15 | -0.2008006 |
| D16 | 0.0029207 |
| D17 | 0.3509064 |
| D18 | 0.6310677 |
| D20T21 | 0.3777503 |
| D22 | 0.6723993 |
| D23 | 0.4335162 |
| D24T25 | 0.6566184 |
| D26 | 0.8548484 |
| D27 | 0.7303947 |
| D28 | 0.6521289 |
| D29T30 | 0.7149755 |
| D31T32 | 0.6050431 |
constructed <- naadj_dependent_var %>% filter(country_iso == "USA") %>% select(year, industry_code, log_fcit)
industry_level_corr_adj <- left_join(constructed, industry_level, by = c("industry_code", "year")) %>% group_by(industry_code) %>% summarize(cor(log_fcit, tfp_b2012))
kable(industry_level_corr_adj, caption = "Industry-level correlations: Constructed TFP vs. BLS Measure") %>% kable_styling(bootstrap_options = c("striped", "hover"))
| industry_code | cor(log_fcit, tfp_b2012) |
|---|---|
| D10T12 | 0.5190146 |
| D10T33 | NA |
| D13T15 | -0.2008006 |
| D16 | 0.0029207 |
| D17 | 0.3509064 |
| D18 | 0.6310677 |
| D20T21 | 0.3777503 |
| D22 | 0.6723993 |
| D23 | 0.4335162 |
| D24T25 | 0.6566184 |
| D26 | 0.8548484 |
| D27 | 0.7303947 |
| D28 | 0.6521289 |
| D29T30 | 0.7149755 |
| D31T32 | 0.6050431 |
We notice that the unadjusted measure shows better correlations with TFP measures. However, the difference is minimal. To the extent that the adjustments allow us to increase our sample size by over 500 observations, we prefer to use the adjusted TFP measure.
rm(list=ls())
explanatory_var <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/SPED.csv")
dependent_var <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/Dpendent_var_nas.csv")
rnd <- read.csv("/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/RnD.csv")
explanatory_var <- explanatory_var %>% filter(from == "USA" | from == "FRA" | from == "JPN" | from == "GBR" | from == "DEU") %>% filter(to == "CAN" | to == "ITA" | to == "AUS" | to == "NOR" | to == "FIN" | to == "DNK" | to == "ESP" | to == "NLD" | to == "SWE")
rnd <- rnd %>% filter(country_iso == "USA" | country_iso == "FRA" | country_iso == "JPN" | country_iso == "GBR" | country_iso == "DEU")
frame1 <- left_join(explanatory_var, rnd, by = c("from" = "country_iso", "year" = "year")) #expected 585x15 = 8190 got 8181 instead, some missing, that's okk.
frame2 <- left_join(frame1, dependent_var, by = c("to" = "country_iso", "year" = "year")) #expected 8181x15 = 122,715 got 120,954, close enough.
write.csv(frame2, "/Users/yab/Dropbox/Trade projects/Networks/Data/Keller rep/regression_data.csv")