Parte 2 examen final

Pregunta 1 Data INEI:

library(rio)
# Load the INEI province-level report and list its column names.
inei <- import(file = "reporteEXA.xlsx")
names(inei)

##  [1] "Código"                 "Provincia"              "No usa electricidad"   
##  [4] "Sí usa electricidad"    "No usa gas (balón GLP)" "Sí usa gas (balón GLP)"
##  [7] "No usa carbón"          "Sí usa carbón"          "No usa leña"           
## [10] "Sí usa leña"

Análisis factorial con los datos:

# Inspect the column types of the imported table.
str(inei)

## 'data.frame':    197 obs. of  10 variables:
##  $ Código                : chr  "101" "102" "103" "104" ...
##  $ Provincia             : chr  "Amazonas, provincia: Chachapoyas" "Amazonas, provincia: Bagua" "Amazonas, provincia: Bongara" "Amazonas, provincia: Condorcanqui" ...
##  $ No usa electricidad   : num  14763 20313 7689 9853 13112 ...
##  $ Sí usa electricidad   : num  574 161 124 14 90 65 255 921 16 33 ...
##  $ No usa gas (balón GLP): num  4696 10557 3154 8331 6863 ...
##  $ Sí usa gas (balón GLP): num  10641 9917 4659 1536 6339 ...
##  $ No usa carbón         : num  15161 20185 7755 9841 13169 ...
##  $ Sí usa carbón         : num  176 289 58 26 33 26 335 218 4 4 ...
##  $ No usa leña           : num  7236 7357 2345 1059 1833 ...
##  $ Sí usa leña           : num  8101 13117 5468 8808 11369 ...

# Drop row 197, the last imported row (presumably a totals row; TODO confirm).
inei <- inei[-197, ]

# Replace the accented column name with an ASCII one.
colnames(inei)[colnames(inei) == "Código"] <- "Codigo"

# Province codes arrive as text; store them as numbers and check the result.
inei[["Codigo"]] <- as.numeric(inei[["Codigo"]])
str(inei[["Codigo"]])

##  num [1:196] 101 102 103 104 105 106 107 201 202 203 ...

# Summarize every column after dropping row 197 and converting Codigo.
summary(inei)

##      Codigo        Provincia         No usa electricidad Sí usa electricidad
##  Min.   : 101.0   Length:196         Min.   :    687     Min.   :     1.00  
##  1st Qu.: 507.8   Class :character   1st Qu.:   8176     1st Qu.:    57.75  
##  Median :1056.0   Mode  :character   Median :  15798     Median :   146.50  
##  Mean   :1145.9                      Mean   :  40812     Mean   :  1291.95  
##  3rd Qu.:1702.2                      3rd Qu.:  32071     3rd Qu.:   479.00  
##  Max.   :2504.0                      Max.   :2228751     Max.   :125199.00  
##  No usa gas (balón GLP) Sí usa gas (balón GLP) No usa carbón    
##  Min.   :   572         Min.   :    116        Min.   :    600  
##  1st Qu.:  4649         1st Qu.:   2740        1st Qu.:   8193  
##  Median :  7776         Median :   7402        Median :  15618  
##  Mean   : 12711         Mean   :  29393        Mean   :  40983  
##  3rd Qu.: 12664         3rd Qu.:  17661        3rd Qu.:  31502  
##  Max.   :515782         Max.   :1838168        Max.   :2329530  
##  Sí usa carbón      No usa leña       Sí usa leña   
##  Min.   :    4.0   Min.   :    175   Min.   :  510  
##  1st Qu.:   33.0   1st Qu.:   1801   1st Qu.: 4987  
##  Median :  106.5   Median :   5155   Median : 9004  
##  Mean   : 1120.6   Mean   :  30459   Mean   :11645  
##  3rd Qu.:  348.2   3rd Qu.:  16494   3rd Qu.:15076  
##  Max.   :29288.0   Max.   :2310046   Max.   :58210

# Count missing values across the whole data frame.
sum(is.na(inei))

## [1] 0

library(psych)
library(GPArotation)

## 
## Attaching package: 'GPArotation'

## The following objects are masked from 'package:psych':
## 
##     equamax, varimin

Seleccionando:

library(BBmisc)

## 
## Attaching package: 'BBmisc'

## The following object is masked from 'package:base':
## 
##     isFALSE

# Standardize the four "Sí usa ..." columns (positions 4, 6, 8 and 10) in place.
inei[, c(4, 6, 8, 10)] <- normalize(
  inei[, c(4, 6, 8, 10)],
  method = "standardize"
)

# Working table for the factor analysis: only the standardized columns.
theData <- inei[, c(4, 6, 8, 10)]
library(magrittr)
rmarkdown::paged_table(head(theData, 10))

library(polycor)

## 
## Attaching package: 'polycor'

## The following object is masked from 'package:psych':
## 
##     polyserial

# Summary of the four standardized columns.
summary(theData)

##  Sí usa electricidad Sí usa gas (balón GLP) Sí usa carbón      Sí usa leña     
##  Min.   :-0.14275    Min.   :-0.21565       Min.   :-0.2870   Min.   :-1.1077  
##  1st Qu.:-0.13647    1st Qu.:-0.19633       1st Qu.:-0.2795   1st Qu.:-0.6623  
##  Median :-0.12666    Median :-0.16198       Median :-0.2606   Median :-0.2628  
##  Mean   : 0.00000    Mean   : 0.00000       Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.08989    3rd Qu.:-0.08642       3rd Qu.:-0.1985   3rd Qu.: 0.3413  
##  Max.   :13.70080    Max.   :13.32348       Max.   : 7.2387   Max.   : 4.6322

library(DescTools)

## 
## Attaching package: 'DescTools'

## The following object is masked from 'package:BBmisc':
## 
##     %nin%

## The following objects are masked from 'package:psych':
## 
##     AUC, ICC, SD

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(kableExtra)

# Correlation matrix of the standardized variables (cor() defaults to Pearson).
cor(theData)

##                        Sí usa electricidad Sí usa gas (balón GLP) Sí usa carbón
## Sí usa electricidad              1.0000000              0.9920048     0.4873608
## Sí usa gas (balón GLP)           0.9920048              1.0000000     0.5269917
## Sí usa carbón                    0.4873608              0.5269917     1.0000000
## Sí usa leña                      0.2868231              0.3426981     0.4168478
##                        Sí usa leña
## Sí usa electricidad      0.2868231
## Sí usa gas (balón GLP)   0.3426981
## Sí usa carbón            0.4168478
## Sí usa leña              1.0000000

# Correlation matrix as estimated by polycor::hetcor(), kept for the
# adequacy checks that follow.
corMatrix <- polycor::hetcor(theData)[["correlations"]]

round(corMatrix, digits = 2)

##                        Sí usa electricidad Sí usa gas (balón GLP) Sí usa carbón
## Sí usa electricidad                   1.00                   0.99          0.49
## Sí usa gas (balón GLP)                0.99                   1.00          0.53
## Sí usa carbón                         0.49                   0.53          1.00
## Sí usa leña                           0.29                   0.34          0.42
##                        Sí usa leña
## Sí usa electricidad           0.29
## Sí usa gas (balón GLP)        0.34
## Sí usa carbón                 0.42
## Sí usa leña                   1.00

KMO

library(psych)
# Kaiser-Meyer-Olkin adequacy (overall and per item) from the correlation matrix.
psych::KMO(corMatrix)

## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.56
## MSA for each item = 
##    Sí usa electricidad Sí usa gas (balón GLP)          Sí usa carbón 
##                   0.52                   0.53                   0.81 
##            Sí usa leña 
##                   0.52

# Bartlett's test on the correlation matrix; prints TRUE only when the
# p-value exceeds 0.05 (i.e. an identity matrix could not be rejected).
cortest.bartlett(corMatrix,n=nrow(theData))$p.value>0.05

## [1] FALSE

library(matrixcalc)

# Check whether the correlation matrix is singular.
is.singular.matrix(corMatrix)

## [1] FALSE

# Parallel analysis to suggest how many factors to retain (no plot).
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, which would silently change these arguments.
fa.parallel(theData, fa = "fa", correct = TRUE, plot = FALSE)

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA

# Fit a two-factor model (minres extraction, varimax rotation) and show
# the full loading matrix.
resfa <- fa(
  theData,
  nfactors = 2,
  cor = "mixed",
  rotate = "varimax",
  fm = "minres"
)
print(resfa[["loadings"]])

## 
## Loadings:
##                        MR1   MR2  
## Sí usa electricidad    0.968 0.242
## Sí usa gas (balón GLP) 0.942 0.330
## Sí usa carbón          0.365 0.554
## Sí usa leña            0.130 0.667
## 
##                  MR1   MR2
## SS loadings    1.974 0.919
## Proportion Var 0.494 0.230
## Cumulative Var 0.494 0.723

# Print the loadings again, hiding entries whose absolute value is below 0.5.
print(resfa$loadings,cutoff = 0.5)

## 
## Loadings:
##                        MR1   MR2  
## Sí usa electricidad    0.968      
## Sí usa gas (balón GLP) 0.942      
## Sí usa carbón                0.554
## Sí usa leña                  0.667
## 
##                  MR1   MR2
## SS loadings    1.974 0.919
## Proportion Var 0.494 0.230
## Cumulative Var 0.494 0.723

Con un punto de corte de 0.5, cada variable carga en un solo factor, por lo que se obtiene una estructura simple.

Pregunta 2

# Load the combined housing / election / COVID table.
covid <- import(file = "dataOK_all.xlsx")

## New names:
## • `` -> `...1`

# List the columns of the imported table.
names(covid)

##  [1] "...1"                    "key"                    
##  [3] "Código"                  "pared1_Ladrillo"        
##  [5] "pared2_Piedra"           "pared3_Adobe"           
##  [7] "pared4_Tapia"            "pared5_Quincha"         
##  [9] "pared6_Piedra"           "pared7_Madera"          
## [11] "pared8_Triplay"          "pared9_Otro"            
## [13] "pared10_Total"           "techo1_Concreto"        
## [15] "techo2_Madera"           "techo3_Tejas"           
## [17] "techo4_Planchas"         "techo5_Caña"            
## [19] "techo6_Triplay"          "techo7_Paja"            
## [21] "techo8_Otro"             "techo9_Total"           
## [23] "piso1_Parquet"           "piso2_Láminas"          
## [25] "piso3_Losetas"           "piso4_Madera"           
## [27] "piso5_Cemento"           "piso6_Tierra"           
## [29] "piso7_Otro"              "piso8_Total"            
## [31] "agua1_Red"               "agua2_Red_fueraVivienda"
## [33] "agua3_Pilón"             "agua4_Camión"           
## [35] "agua5_Pozo"              "agua6_Manantial"        
## [37] "agua7_Río"               "agua8_Otro"             
## [39] "agua9_Vecino"            "agua10_Total"           
## [41] "elec1_Sí"                "elec2_No"               
## [43] "elec3_Total"             "departamento"           
## [45] "provincia"               "Castillo"               
## [47] "Keiko"                   "ganaCastillo"           
## [49] "covidPositivos"          "covidFallecidos"

# Raw copy of the columns of interest: agua1_Red, agua10_Total, elec3_Total,
# provincia, Castillo, Keiko, ganaCastillo, covidPositivos, covidFallecidos.
# It is taken before standardizing, so covidk keeps the original values.
covidk <- covid[, c(31, 40, 43, 45:50)]

# Standardize those same columns inside the full table.
covid[, c(31, 40, 43, 45:50)] <- normalize(
  covid[, c(31, 40, 43, 45:50)],
  method = "standardize"
)

tasa de fallecidos por cada 1000

# Death rate per 1000: COVID deaths divided by the elec3_Total count, times 1000.
# The earlier version divided covidPositivos (cases), which contradicted the
# stated death rate. The str() output printed below predates this fix and
# must be regenerated.
covidk$tasa <- covidk$covidFallecidos / covidk$elec3_Total * 1000
str(covidk$tasa)

##  num [1:196] 416.3 53.2 155.3 358.3 35.7 ...

Pregunta 3 Data CIA

library (rio)
# Load installed electricity generating capacity and inspect its columns.
elec <- import(file = "Electricity - installed generating capacity.csv")
names(elec)

## [1] "name"                "slug"                "kW"                 
## [4] "date_of_information" "ranking"             "region"

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:kableExtra':
## 
##     group_rows

## The following objects are masked from 'package:BBmisc':
## 
##     coalesce, collapse, symdiff

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Keep only the country name and the capacity column (kW).
df <- select(elec, all_of(c("name", "kW")))

# Load refined petroleum production and inspect its columns.
produc <- import(file = "Refined petroleum products - production.csv")
names(produc)

## [1] "name"                "slug"                "bbl/day"            
## [4] "date_of_information" "ranking"             "region"

# Keep only the country name and the production column (bbl/day).
df2 <- select(produc, all_of(c("name", "bbl/day")))

# Load carbon dioxide emissions and inspect its columns.
carbon <- import(file = "Carbon dioxide emissions.csv")
names(carbon)

## [1] "name"                 "slug"                 "metric tonnes of CO2"
## [4] "date_of_information"  "ranking"              "region"

# Keep only the country name and the emissions column.
df3 <- select(carbon, all_of(c("name", "metric tonnes of CO2")))

# Load energy consumption per capita and inspect its columns.
energy <- import(file = "Energy consumption per capita.csv")
names(energy)

## [1] "name"                "slug"                "Btu/person"         
## [4] "date_of_information" "ranking"             "region"

# Keep only the country name and the per-capita consumption column.
df4 <- select(energy, all_of(c("name", "Btu/person")))

# Load fixed telephone lines and inspect its columns.
telefix <- import(file = "Telephones - fixed lines.csv")
names(telefix)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

# Keep name and value, renaming the generic "value" column so it stays
# distinguishable from the other tables that also use "value".
df5 <- select(telefix, "name", valuefix = "value")

# Load mobile cellular subscriptions and inspect its columns.
telemob <- import(file = "Telephones - mobile cellular.csv")
names(telemob)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

# Keep name and value, renaming "value" to "valuemob".
df6 <- select(telemob, "name", valuemob = "value")

# Load fixed broadband subscriptions and inspect its columns.
broad <- import(file = "Broadband - fixed subscriptions.csv")
names(broad)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

# Keep name and value, renaming "value" to "valuebroad".
df11 <- select(broad, "name", valuebroad = "value")

# Load consumer-price inflation and inspect its columns.
inflat <- import(file = "Inflation rate (consumer prices).csv")
names(inflat)

## [1] "name"                "slug"                "%"                  
## [4] "date_of_information" "ranking"             "region"

# Keep name and the "%" column, giving the latter a descriptive name.
df7 <- select(inflat, "name", "% of inflation" = "%")

# Load youth unemployment (ages 15-24) and inspect its columns.
youth <- import(file = "Youth unemployment rate (ages 15-24).csv")
names(youth)

## [1] "name"                "slug"                "%"                  
## [4] "date_of_information" "ranking"             "region"

# Keep name and the "%" column, giving the latter a descriptive name.
df8 <- select(youth, "name", "% of youth unemployment" = "%")

# Load public debt and inspect its columns.
debt <- import(file = "Public debt.csv")
names(debt)

## [1] "name"                "slug"                "% of GDP"           
## [4] "date_of_information" "ranking"             "region"

# Keep name and the "% of GDP" column, relabelled as public debt.
df9 <- select(debt, "name", "% public debt" = "% of GDP")

# NOTE(review): this reads "Public debt.csv" again, the same file already
# loaded into `debt`, so the "external" series derived from `debtex` is a
# copy of public debt. Presumably the external-debt file was intended;
# confirm the correct file name and value column before changing it.
debtex=import("Public debt.csv")
names(debtex)

## [1] "name"                "slug"                "% of GDP"           
## [4] "date_of_information" "ranking"             "region"

# Keep name and the "% of GDP" column, relabelled as external public debt.
df10 <- select(debtex, "name", "% public debt-external" = "% of GDP")

# Merge all eleven indicator tables by country name. The earlier version
# joined only df and df2, leaving the other nine tables out of datafinal.
# merge() defaults to an inner join, so only countries present in every
# table are kept. Column names are unique across tables, so no suffixes
# are added.
datafinal <- Reduce(
  function(left, right) merge(left, right, by = "name"),
  list(df, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11)
)

# Load the consolidated CIA table from disk and list its columns.
cia <- import(file = "datafinal.xlsx")
names(cia)

##  [1] "name"                    "kW"                     
##  [3] "bbl/day"                 "metric tonnes of CO2"   
##  [5] "Btu/person"              "valuefix"               
##  [7] "valuemob"                "% of inflation"         
##  [9] "% of youth unemployment" "% public debt"          
## [11] "% public debt-external"  "valuebroad"

# Inspect the column types of the consolidated CIA table.
str(cia)

## 'data.frame':    181 obs. of  12 variables:
##  $ name                   : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ kW                     : chr  "776,000" "2,531,000" "21,694,000" "7,344,000" ...
##  $ bbl/day                : chr  "0" "5,638" "627,900" "53,480" ...
##  $ metric tonnes of CO2   : chr  "7,893,000" "3,794,000" "151,633,000" "19,362,000" ...
##  $ Btu/person             : chr  "3,227,000" "38,442,000" "61,433,000" "11,693,000" ...
##  $ valuefix               : chr  "146,000" "177,000" "5,576,000" "94,000" ...
##  $ valuemob               : chr  "22,678,000" "2,782,000" "49,019,000" "23,978,000" ...
##  $ % of inflation         : chr  "2.3" "6.73" "9.27" "25.75" ...
##  $ % of youth unemployment: num  20.2 27.8 31.9 18.5 29.9 36.1 10.8 11.4 16.5 30.8 ...
##  $ % public debt          : num  7 82.4 27.5 65 57.6 ...
##  $ % public debt-external : num  7 82.4 27.5 65 57.6 ...
##  $ valuebroad             : chr  "26,570" "508,937" "3,790,459" "230,610" ...

# Remove thousands separators from the CO2 column (it stays character).
cia[["metric tonnes of CO2"]] <- gsub(
  ",", "", cia[["metric tonnes of CO2"]],
  fixed = TRUE
)
str(cia[["metric tonnes of CO2"]])

##  chr [1:181] "7893000" "3794000" "151633000" "19362000" "193205000" ...

# Numeric copy of the cleaned CO2 column, stored under a syntactic name.
cia[["CO2"]] <- as.numeric(cia[["metric tonnes of CO2"]])
str(cia[["CO2"]])

##  num [1:181] 7.89e+06 3.79e+06 1.52e+08 1.94e+07 1.93e+08 ...

valuefix: líneas telefónicas fijas (fixed lines)

# Strip thousands separators from valuefix, then keep a numeric copy as `fixed`.
cia[["valuefix"]] <- gsub(",", "", cia[["valuefix"]], fixed = TRUE)
cia[["fixed"]] <- as.numeric(cia[["valuefix"]])
str(cia[["fixed"]])

##  num [1:181] 146000 177000 5576000 94000 7615000 ...

# Strip thousands separators from kW, then keep a numeric copy as `elecap`.
cia[["kW"]] <- gsub(",", "", cia[["kW"]], fixed = TRUE)
cia[["elecap"]] <- as.numeric(cia[["kW"]])
str(cia[["elecap"]])

##  num [1:181] 776000 2531000 21694000 7344000 44731000 ...

Regresión: fixed lines y CO2. Independientes: energy = kW, bbl/day, Btu/person; communications = valuefix, valuemob, valuebroad

Dependiente: economy = % of inflation, % youth, % public debt

# Linear model: public debt (% of GDP) explained by CO2 emissions and fixed
# telephone lines. The response is referenced by column name so that every
# term is resolved inside `data`. Writing cia$... in the formula ties the
# fit to the global object and breaks refitting on another data frame.
modelo1 <- lm(`% public debt` ~ CO2 + fixed, data = cia)
summary(modelo1)

## 
## Call:
## lm(formula = `% public debt` ~ CO2 + fixed, data = cia)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.258 -21.083  -4.635  15.639 174.016 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.348e+01  2.669e+00  20.034  < 2e-16 ***
## CO2         -4.262e-08  8.138e-09  -5.237 4.57e-07 ***
## fixed        2.618e-06  4.463e-07   5.865 2.14e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.09 on 178 degrees of freedom
## Multiple R-squared:  0.1645, Adjusted R-squared:  0.1551 
## F-statistic: 17.53 on 2 and 178 DF,  p-value: 1.127e-07

Parte 2 examen final

Ana Gutierrez Montufar

2024-11-28