title: “AdvancedR_JEDSI_2023” author: “Ivy Ortiz” date: “2023-08-10” output: html_document: toc: yes toc_float: true code_folding: hide —

Importation

Datasets

LCover <- read.csv("WPF_County_LCover.csv")
#View(LCover)
FAccess <- read.csv("GR_FoodAccess2.csv")
#View(FAccess)

Packages

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(GGally)

## Loading required package: ggplot2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ggfortify)
library(corrplot)

## corrplot 0.92 loaded

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(ggthemes)
library(ggdist)
library(ggplot2)

Data Modification

Recoding

#LCover$ReLCover[((LCover$Landcover == 'Woody Wetlands')|(LCover$Landcover=='Evergreen Forest')|(LCover$Landcover=='Deciduous Forest')|(LCover$Landcover=='Mixed Forest'))] <- "Forest"
#LCover$ReLCover[((LCover$Landcover == 'Developed, Medium Intensity')|(LCover$Landcover=='Developed, High Intensity'))] <- "HDevelop"
#LCover$ReLCover[((LCover$Landcover == 'Hay/Pasture')|(LCover$Landcover=='Developed, Open Space')|(LCover$Landcover=='Developed, Low Intensity')|(LCover$Landcover=='Cultivated Crops'))] <- "LDevelop"
#LCover$ReLCover[((LCover$Landcover == 'Barren Land')|(LCover$Landcover== 'Emergent Herbaceous Wetlands')|(LCover$Landcover=='Herbaceous')|(LCover$Landcover=='Open Water')|(LCover$Landcover=='Shrub/Scrub'))] <- "Conserve"

#write.csv(LCover,'DRWS_LCover_Recode.csv',row.names=FALSE)

Aggregating Area DRWS Land Cover

ReLC <- read.csv("DRWS_LCover_Recode.csv")
arNum <- as.numeric(ReLC$Area)

## Warning: NAs introduced by coercion

aggLCA <- aggregate(arNum~ReLC$COUNTY+ReLC$STATE+ReLC$ReLCover, FUN=sum)
#View(aggLCA)

Calculating Percentages

ReLC2 <- aggLCA
st <- ReLC2$`ReLC$STATE`
ct <- ReLC2$`ReLC$COUNTY`

ReLC2$Location <- paste(ReLC2$`ReLC$COUNTY`,ReLC2$`ReLC$STATE`,sep="_")
#View(ReLC2)

aggReLC2 <- aggregate(ReLC2$arNum~ReLC2$`ReLC$COUNTY`+ReLC2$`ReLC$STATE`,FUN=sum)
aggReLC2$Location <- paste(aggReLC2$`ReLC2$\`ReLC$COUNTY\``,aggReLC2$`ReLC2$\`ReLC$STATE\``,sep="_")
#View(aggReLC2)

dfx <- ReLC2 %>% inner_join( aggReLC2, 
        by=c('Location'='Location'))
#View(dfx)

dfx$paLC <- (dfx$arNum/dfx$`ReLC2$arNum`)*100

Recoding Thresholds

dfx$cpaLC[dfx$paLC<=30] <- "low"
dfx$cpaLC[30>dfx$paLC&dfx$palLC<50] <- "medium"
dfx$cpaLC[dfx$paLC>=50] <- "high"

Creating New Fields

PAsian <- (FAccess$Asian/FAccess$TotPop)*100
PBIPOC <- ((FAccess$TotPop-FAccess$White)/FAccess$TotPop)*100
PBlack <- (FAccess$Black/FAccess$TotPop)*100
PHawaiianPI <- (FAccess$HawaiianPI/FAccess$TotPop)*100
PHispanic <- (FAccess$Hispanic/FAccess$TotPop)*100
PNativeAmerican <- (FAccess$NativeAmerican/FAccess$TotPop)*100
POtherRace <- (FAccess$OtherRace/FAccess$TotPop)*100
PWhite <- (FAccess$White/FAccess$TotPop)*100
Incomeadj <- FAccess$MedianHHIncome^(1/2)
PSNAP <- FAccess$HHSNAP/FAccess$TotalHouseholds
PGED <- FAccess$GED/FAccess$X25pTotal
PBachelor <- FAccess$Bachelor/FAccess$X25pTotal
FAccessBind <- cbind(FAccess,PAsian,PBIPOC,PBlack,PHawaiianPI,PHispanic,PNativeAmerican,POtherRace,PWhite,Incomeadj,PSNAP,PGED,PBachelor)
FAccessBind$CBlack[FAccessBind$PBlack<=30] <- "low"
FAccessBind$CBlack[FAccessBind$PBlack>30] <- "high"
FAccessBind$CHispanic[FAccessBind$PHispanic<=30] <- "low"
FAccessBind$CHispanic[FAccessBind$PHispanic>30] <- "high"
FAccessBind$CBIPOC[FAccessBind$PBIPOC<=40] <- "low"
FAccessBind$CBIPOC[FAccessBind$PBIPOC>40] <- "high"
#(FAccessBind)

OutletsZip <- read.csv("GR_FoodAccess_OutletsZip.csv")

dfy <- FAccessBind %>% inner_join( OutletsZip, 
        by=c('GEOID'='GEOID'))
#View(dfy)

Graphics

corplot

GRdf0 <- dfy[complete.cases(dfy),]
GRdf1 <- GRdf0[-c(1:19,32:40,42:56,58:68,70:72,74:83)]
#View(GRdf0)
CorGRdf1 <- cor(GRdf1)
corrplot(CorGRdf1, is.corr = FALSE, method = "square")

Linear Models

fit1 <- lm(dfy$Coffee ~ dfy$PBIPOC + dfy$PBachelor + dfy$Incomeadj)
summary(fit1)

## 
## Call:
## lm(formula = dfy$Coffee ~ dfy$PBIPOC + dfy$PBachelor + dfy$Incomeadj)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8757 -0.7816 -0.2667  0.4414  5.1822 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.852394   1.053370   2.708 0.008004 ** 
## dfy$PBIPOC    -0.016084   0.008939  -1.799 0.075077 .  
## dfy$PBachelor  6.187661   1.547019   4.000 0.000124 ***
## dfy$Incomeadj -0.012714   0.003979  -3.195 0.001887 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.293 on 97 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1939, Adjusted R-squared:  0.169 
## F-statistic: 7.779 on 3 and 97 DF,  p-value: 0.0001044

par(mfrow=c(2,2))
plot(fit1)

x1 <- dfy[c(21,22,24,27,28,29,30,31)]
ggpairs(x1)

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).

## Warning: Removed 1 rows containing non-finite values (`stat_density()`).

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).

ggplot

OutletType <- read.csv("Outlets_Type.csv")
#View(OutletType)
gg1 <- ggplot(OutletType,
       aes(x = OutletCode,
           y = FREQUENCY,
           fill = FoodOutletType))+
  geom_col(stat = "identity")+
  scale_fill_viridis_d(name = "Group", option = "turbo")+
  labs(title = "Outlet", y = "Count", x = "Group")

## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

gg1 + theme_dark()

cor.test(dfy$PBachelor, dfy$Pharmacy, use = "complete obs", method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dfy$PBachelor and dfy$Pharmacy
## t = 3.4631, df = 100, p-value = 0.0007874
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1417919 0.4905021
## sample estimates:
##       cor 
## 0.3272431

gg2 <- ggplot(dfy,
       aes(x = PBachelor,
           y = Pharmacy,
           color = TotPop))+
  geom_point()+
  geom_smooth()+
 labs(title = "Pharmacy ~ PBachelor %",
       x = "PBachelor %",
       y = "Pharmacy")

gg2 + theme_dark()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

ttest1 <- t.test(Coffee~CBlack, data=dfy, paired=F)
ttest1

## 
##  Welch Two Sample t-test
## 
## data:  Coffee by CBlack
## t = -2.5654, df = 64.324, p-value = 0.01265
## alternative hypothesis: true difference in means between group high and group low is not equal to 0
## 95 percent confidence interval:
##  -0.9268659 -0.1153418
## sample estimates:
## mean in group high  mean in group low 
##          0.2857143          0.8068182

gg3 <- ggplot(dfy,
       aes(x = CBlack,
           y = Coffee,
           fill = CBlack))+
  geom_col(stat = "identity")+
  scale_fill_viridis_d(name = "Group", option = "turbo")+
  labs(title = "Coffee in Tracts with High and Low Pct Black Residents", y = "Count", x = "Group")

## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

gg3+theme_dark()

ttest2 <- t.test(Pharmacy~CHispanic, data=dfy, paired=F)
ttest2

## 
##  Welch Two Sample t-test
## 
## data:  Pharmacy by CHispanic
## t = -5.6145, df = 90, p-value = 2.165e-07
## alternative hypothesis: true difference in means between group high and group low is not equal to 0
## 95 percent confidence interval:
##  -0.7885049 -0.3763303
## sample estimates:
## mean in group high  mean in group low 
##          0.0000000          0.5824176

gg4 <- ggplot(dfy,
       aes(x = CHispanic,
           y = Pharmacy,
           fill = CHispanic))+
  geom_col(stat = "identity")+
  scale_fill_viridis_d(name = "Group", option = "turbo")+
  labs(title = "Pharmacy in Tracts with High and Low Pct Hispanic Residents", y = "Count", x = "Group")

## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

gg4+theme_dark()

plotly

p1 <- dfy %>% 
  plot_ly(x = ~Shape_Area, 
          z = ~Incomeadj, 
          y = ~TotPop,
          color = ~TotPop,
          colors = c("red", "blue")) %>% 
  layout(title = 'Income by Area and Total Population',
         scene = list(xaxis = list(title = 'Area'),
                     yaxis = list(title = 'Total Population'),
                     zaxis = list(title = 'Income Adj ^1/2')))
p1

## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d

## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

## Warning: Ignoring 1 observations

p2 <- dfy %>% 
  plot_ly(x = ~PBlack, 
          z = ~Incomeadj, 
          y = ~PBachelor,
          color = ~TotPop,
          colors = c("green", "blue")) %>% 
  layout(title = 'Income by Pct Black Residents and Pct Bachelor Degrees',
         scene = list(xaxis = list(title = 'Pct Black'),
                     yaxis = list(title = 'Pct Bachor Degrees'),
                     zaxis = list(title = 'Income Adj ^1/2')))
p2

## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

## Warning: Ignoring 1 observations

p3 <- dfy %>% 
  plot_ly(x = ~PHispanic, 
          z = ~Incomeadj, 
          y = ~PBachelor,
          color = ~TotPop,
          colors = c("red", "blue")) %>% 
  layout(title = 'Income by Pct Hispanic  Residents and Pct Bachelor Degrees',
         scene = list(xaxis = list(title = 'Pct Hispanic'),
                     yaxis = list(title = 'Pct Bachor Degrees'),
                     zaxis = list(title = 'Income Adj ^1/2')))
p3

## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

## Warning: Ignoring 1 observations

Principal Component Analysis

x2 <- dfy[c(21,22,24,27,28,29,30,31,41,57,69,73)]
x3 <- x2[complete.cases(x2),]
pca_df <- prcomp(x3,scale. = TRUE)
autoplot(pca_df)

autoplot(pca_df, data = x3, label = TRUE)

biplot(pca_df)