mini4

R Markdown

This is an R Markdown document for Mini Assignment 4. TODO: round the R and p values displayed on the last chart.

# Download coffee.csv from the course site and store it as dFrameCoffee.
dFrameCoffee <- read.csv(
  file = "https://bonwookoo.github.io/UrbanAnalytics2022/Assignment/mini_4/coffee.csv"
)
# Confirm that the loaded object really is a data frame.
class(dFrameCoffee)

## [1] "data.frame"

# List the column names available in the data frame.
colnames(dFrameCoffee)

##  [1] "X"                "GEOID"            "county"           "hhincome"        
##  [5] "pct_pov"          "review_count"     "avg_rating"       "race.tot"        
##  [9] "avg_price"        "pct_white"        "hhincome_log"     "review_count_log"
## [13] "pct_pov_log"      "yelp_n"

# Store the average rating as text so that it is handled as a categorical
# variable in the plots below.
dFrameCoffee[["avg_rating"]] <- as.character(dFrameCoffee[["avg_rating"]])
# Make sure the logged review count is stored as a numeric column.
dFrameCoffee[["review_count_log"]] <- as.numeric(dFrameCoffee[["review_count_log"]])
# Extract the overall F-test p-value from a fitted linear model.
#
# modelobject: an object inheriting from class "lm".
# Returns the p-value as a plain numeric scalar (names/attributes stripped).
# Stops with an error when modelobject is not an "lm" object, or when its
# summary carries no F-statistic (e.g. an intercept-only model).
lmp <- function(modelobject) {
  # inherits() accepts subclasses of "lm" and always yields a single logical,
  # whereas class(x) != "lm" gives a length > 1 condition for multi-class
  # objects, which `if` rejects in recent versions of R.
  if (!inherits(modelobject, "lm")) stop("Not an object of class 'lm' ")
  f <- summary(modelobject)$fstatistic
  # Without an F-statistic, pf() would fail with an unhelpful message.
  if (is.null(f)) stop("Model summary has no F-statistic")
  # f holds the F value followed by its numerator and denominator df.
  p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
  attributes(p) <- NULL
  p
}
#END TEST EXTRACT FUNCTION
# Boxplot of household income for each average-rating category; rows with a
# missing rating are dropped first.
dFrameCoffee |>
  drop_na(avg_rating) |>
  ggplot(mapping = aes(x = avg_rating, y = hhincome)) +
  geom_boxplot() +
  labs(title = "hhincome vs avg_rating") +
  theme_bw()

# Same boxplot as above, split into one panel per county with facet_wrap().
dFrameCoffee |>
  drop_na(avg_rating) |>
  ggplot(mapping = aes(x = avg_rating, y = hhincome)) +
  geom_boxplot() +
  facet_wrap(facets = ~county) +
  labs(
    title = "hhincome vs avg_rating",
    x = "Average Yelp Rating",
    y = "Median Household Income"
  ) +
  theme_bw()

# Scatterplot of logged review count against household income, one panel per
# county, with points coloured by pct_white on a dark-blue-to-red gradient.
dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, hhincome)) +
  # size and alpha are fixed values, so they are set outside aes(). Inside
  # aes() they are treated as data mappings, which adds spurious legends and
  # does not apply the requested size/transparency.
  geom_point(aes(color = pct_white), size = 1, alpha = 0.5) +
  facet_wrap(~county) +
  theme_bw() +
  labs(
    title = "Scatterplot: Review Count vs Household Income",
    x = "Review Count (Log)",
    y = "Median Annual Household Income"
  ) +
  scale_color_gradient(low = "darkblue", high = "red")

# Fit a simple linear regression of logged review count on household income
# and print the model summary.
lmod1 <- lm(formula = review_count_log ~ hhincome, data = dFrameCoffee)
summary(lmod1)

## 
## Call:
## lm(formula = review_count_log ~ hhincome, data = dFrameCoffee)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6426 -0.5533  0.0529  0.6004  3.6950 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.144e+00  1.256e-01  25.023   <2e-16 ***
## hhincome    3.719e-06  1.456e-06   2.554   0.0111 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.004 on 361 degrees of freedom
## Multiple R-squared:  0.01774,    Adjusted R-squared:  0.01502 
## F-statistic: 6.521 on 1 and 361 DF,  p-value: 0.01107

# Inspect the storage type of the fitted model object.
typeof(lmod1)

## [1] "list"

# Overall regression p-value and the correlation between the two variables,
# both used later as annotations on the first scatterplot.
pValuePlot1 <- lmp(lmod1)
rValuePlot1 <- with(
  dFrameCoffee,
  cor(review_count_log, hhincome, use = "complete.obs")
)
typeof(rValuePlot1)

## [1] "double"

print(rValuePlot1)

## [1] 0.1332059

# Scatterplot of household income against logged review count, coloured by
# county, with one fitted regression line per county. The overall correlation
# (R) and regression p-value are written onto the panel.
displayPlot1 <- dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, hhincome, color = county)) +
  geom_point() +
  # color = county is inherited from ggplot(), so no extra mapping is needed.
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  # Round here: the label text is fixed when the plot object is built, so
  # rounding the variables afterwards would not change what the chart shows.
  annotate("text", x = 0.5, y = 200000, hjust = 0,
           label = paste0("R = ", round(rValuePlot1, digits = 3))) +
  # signif() keeps very small p-values readable instead of collapsing to 0.
  annotate("text", x = 4, y = 200000, hjust = 0,
           label = paste0("p = ", signif(pValuePlot1, digits = 3))) +
  #facet_wrap(~county)+
  theme_bw() +
  labs(
    title = "Scatterplot between Logged Review Count and Neighborhood Characteristics",
    x = "Review Count Logged",
    y = "Median Annual Household Income"
  )
   #scale_color_gradient(low="darkblue", high="red")
######################################
##convert review_count_log from numeric to character because it is actually categorical
#dFrameCoffee$review_count_log = as.numeric(dFrameCoffee$review_count_log)
######################################
# Fit a simple linear regression of logged review count on logged poverty
# rate and print the model summary.
lmod2 <- lm(formula = review_count_log ~ pct_pov_log, data = dFrameCoffee)
summary(lmod2)

## 
## Call:
## lm(formula = review_count_log ~ pct_pov_log, data = dFrameCoffee)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6207 -0.5490  0.0403  0.5993  3.6177 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.85709    0.16733  17.074  < 2e-16 ***
## pct_pov_log -0.26365    0.07255  -3.634 0.000319 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.995 on 361 degrees of freedom
## Multiple R-squared:  0.0353, Adjusted R-squared:  0.03262 
## F-statistic: 13.21 on 1 and 361 DF,  p-value: 0.0003192

# Inspect the storage type of the fitted model object.
typeof(lmod2)

## [1] "list"

# Overall regression p-value and the correlation between the two variables,
# both used later as annotations on the second scatterplot.
pValuePlot2 <- lmp(lmod2)
rValuePlot2 <- with(
  dFrameCoffee,
  cor(review_count_log, pct_pov_log, use = "complete.obs")
)
typeof(rValuePlot2)

## [1] "double"

print(rValuePlot2)

## [1] -0.1878719

# Scatterplot of logged poverty rate against logged review count, coloured by
# county, with one fitted regression line per county. The overall correlation
# (R) and regression p-value are written onto the panel.
displayPlot2 <- dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, pct_pov_log, color = county)) +
  # color = county is inherited from ggplot(), so no extra mapping is needed.
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  # Round here: the label text is fixed when the plot object is built, so
  # rounding the variables afterwards would not change what the chart shows.
  annotate("text", x = 0.5, y = 0, hjust = 0,
           label = paste0("R = ", round(rValuePlot2, digits = 3))) +
  # signif() keeps very small p-values readable instead of collapsing to 0.
  annotate("text", x = 4, y = 0, hjust = 0,
           label = paste0("p = ", signif(pValuePlot2, digits = 3))) +
  geom_point() +
  #facet_wrap(~county)+
  theme_bw() +
  labs(
    title = "Scatterplot between Logged Review Count and Neighborhood Characteristics",
    x = "Review Count Logged",
    y = "Percent Residents Under Poverty"
  )
   #scale_color_gradient(low="darkblue", high="red")
######################################################################################
# Fit a simple linear regression of logged review count on pct_white and
# print the model summary.
lmod3 <- lm(formula = review_count_log ~ pct_white, data = dFrameCoffee)
summary(lmod3)

## 
## Call:
## lm(formula = review_count_log ~ pct_white, data = dFrameCoffee)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7484 -0.5491  0.0531  0.6274  3.5923 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.9113     0.1082  26.916  < 2e-16 ***
## pct_white     1.0655     0.1940   5.491 7.54e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9732 on 361 degrees of freedom
## Multiple R-squared:  0.07709,    Adjusted R-squared:  0.07453 
## F-statistic: 30.15 on 1 and 361 DF,  p-value: 7.537e-08

# Inspect the storage type of the fitted model object.
typeof(lmod3)

## [1] "list"

# Overall regression p-value and the correlation between the two variables,
# both used later as annotations on the third scatterplot.
pValuePlot3 <- lmp(lmod3)
rValuePlot3 <- with(
  dFrameCoffee,
  cor(review_count_log, pct_white, use = "complete.obs")
)
typeof(rValuePlot3)

## [1] "double"

print(rValuePlot3)

## [1] 0.2776534

# Scatterplot of pct_white against logged review count, coloured by county,
# with one fitted regression line per county. The overall correlation (R) and
# regression p-value are written onto the panel.
displayPlot3 <- dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, pct_white, color = county)) +
  # color = county is inherited from ggplot(), so no extra mapping is needed.
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  # Round here: the label text is fixed when the plot object is built, so
  # rounding the variables afterwards would not change what the chart shows.
  annotate("text", x = 0.5, y = 0.99, hjust = 0,
           label = paste0("R = ", round(rValuePlot3, digits = 3))) +
  # signif() keeps very small p-values readable instead of collapsing to 0.
  annotate("text", x = 4, y = 0.99, hjust = 0,
           label = paste0("p = ", signif(pValuePlot3, digits = 3))) +
  geom_point() +
  #facet_wrap(~county)+
  theme_bw() +
  labs(
    title = "Scatterplot between Logged Review Count and Neighborhood Characteristics",
    x = "Review Count Logged",
    y = "Percent White Resident"
  )
##########################################
# Fit a simple linear regression of logged review count on race.tot and
# print the model summary.
lmod4 <- lm(formula = review_count_log ~ race.tot, data = dFrameCoffee)
summary(lmod4)

## 
## Call:
## lm(formula = review_count_log ~ race.tot, data = dFrameCoffee)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7670 -0.5257  0.0425  0.5489  3.7358 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.468e+00  1.150e-01  30.156   <2e-16 ***
## race.tot    -5.199e-06  1.602e-05  -0.324    0.746    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.013 on 361 degrees of freedom
## Multiple R-squared:  0.0002916,  Adjusted R-squared:  -0.002478 
## F-statistic: 0.1053 on 1 and 361 DF,  p-value: 0.7458

# Inspect the storage type of the fitted model object.
typeof(lmod4)

## [1] "list"

# Overall regression p-value and the correlation between the two variables,
# both used later as annotations on the fourth scatterplot.
pValuePlot4 <- lmp(lmod4)
rValuePlot4 <- with(
  dFrameCoffee,
  cor(review_count_log, race.tot, use = "complete.obs")
)
typeof(rValuePlot4)

## [1] "double"

print(rValuePlot4)

## [1] -0.01707485

#pValuePlot4 <- lmp(lmod4)
#rValuePlot4 <- as.character(rValuePlot4)
#typeof(rValuePlot4)
#print(rValuePlot4)
# Scatterplot of race.tot (labelled "Total Population") against logged review
# count, coloured by county, with one fitted regression line per county. The
# overall correlation (R) and regression p-value are written onto the panel.
displayPlot4 <- dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, race.tot, color = county)) +
  # color = county is inherited from ggplot(), so no extra mapping is needed.
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  #stat_cor(label.x = 1, label.y = 20000)+
  # Round here: the label text is fixed when the plot object is built, so
  # rounding the variables afterwards would not change what the chart shows.
  annotate("text", x = 0.5, y = 23000, hjust = 0,
           label = paste0("R = ", round(rValuePlot4, digits = 3))) +
  # signif() keeps very small p-values readable instead of collapsing to 0.
  annotate("text", x = 4, y = 23000, hjust = 0,
           label = paste0("p = ", signif(pValuePlot4, digits = 3))) +
  geom_point() +
  #facet_wrap(~county)+
  theme_bw() +
  labs(
    title = "Scatterplot between Logged Review Count and Neighborhood Characteristics",
    x = "Review Count Logged",
    y = "Total Population"
  )
# Render each of the four scatterplots in turn.
print(displayPlot1)

print(displayPlot2)

print(displayPlot3)

print(displayPlot4)

# Check how the correlation and p-value for the first model are stored.
typeof(rValuePlot1)

## [1] "double"

typeof(pValuePlot1)

## [1] "double"

# Overwrite both values with versions rounded to three decimal places.
# Note: these assignments run after displayPlot1 was constructed, so they do
# not alter the labels already stored in that plot object.
rValuePlot1 <- round(rValuePlot1, 3)
pValuePlot1 <- round(pValuePlot1, 3)
rValuePlot1

## [1] 0.133

pValuePlot1

## [1] 0.011

# Panels 1 and 2: hide the legend and blank out the x-axis label.
d1 <- displayPlot1 +
  theme(legend.position = "none") +
  labs(x = "")
d2 <- displayPlot2 +
  theme(legend.position = "none") +
  labs(x = "")
# Panels 3 and 4: keep the legend but blank out the title.
d3 <- displayPlot3 + labs(title = "")
d4 <- displayPlot4 + labs(title = "")
#d1 + plot_layout(widths = c(10, 1)) + 
#d2 + plot_layout(widths = c(10, 1)) +
#d3 + plot_layout(widths = c(10, 1)) + plot_spacer() + plot_spacer()+
#d4 + plot_layout(widths = c(10,1), guides = 'collect')
# Show the panels in two stacked pairs (1 over 3, then 2 over 4).
d1 / d3

d2 / d4

# Remove the remaining per-panel titles, and hide the legend and x-axis label
# on panel 3, before combining all four panels.
d1 <- d1 + labs(title = "")
d2 <- d2 + labs(title = "")
d3 <- d3 +
  theme(legend.position = "none") +
  labs(x = "")
#d4= d4 +labs(x="")

# Combine the four panels into one figure and add a shared, smaller title.
patchwork <- d1 + d2 + d3 + d4
patchwork +
  plot_annotation(
    title = 'Scatterplot between logged review count & neighborhood characteristics \nUsing Yelp Data from 5 Counties around Atlanta',
    subtitle = '',
    caption = '',
    theme = theme(plot.title = element_text(size = 8))
  )

mini4

Siddhartha Baghel

2022-10-09

R Markdown