This is an R Markdown document for Mini Assignment 4. TODO: round the R and p values shown on the last chart.
# Download the coffee CSV and keep it in a data frame named dFrameCoffee.
dFrameCoffee <- read.csv(
  file = "https://bonwookoo.github.io/UrbanAnalytics2022/Assignment/mini_4/coffee.csv"
)
# Confirm that the import produced a data frame.
class(dFrameCoffee)
## [1] "data.frame"
# List the column names of the data frame.
names(dFrameCoffee)
## [1] "X" "GEOID" "county" "hhincome"
## [5] "pct_pov" "review_count" "avg_rating" "race.tot"
## [9] "avg_price" "pct_white" "hhincome_log" "review_count_log"
## [13] "pct_pov_log" "yelp_n"
# Store the ratings as character so that plots treat them as categories
# rather than as a continuous number.
dFrameCoffee[["avg_rating"]] <- as.character(dFrameCoffee[["avg_rating"]])
# Make sure the logged review count is numeric.
dFrameCoffee[["review_count_log"]] <-
  as.numeric(dFrameCoffee[["review_count_log"]])
# Extract the p-value of the overall F-test from a fitted linear model.
#
# Arguments:
#   modelobject  a fitted model that inherits from class "lm".
# Returns:
#   A bare numeric scalar (no names or other attributes): the upper-tail
#   probability of the F-statistic reported by summary(modelobject).
# Errors:
#   Stops if modelobject does not inherit from "lm", or if its summary has no
#   F-statistic.
lmp <- function (modelobject) {
  # inherits() works for objects whose class vector has several elements;
  # comparing class() with != would hand if() a condition of length > 1.
  if (!inherits(modelobject, "lm")) stop("Not an object of class 'lm' ")
  f <- summary(modelobject)$fstatistic
  if (is.null(f)) {
    stop("The model summary has no F-statistic to compute a p-value from")
  }
  # f holds the F value followed by its numerator and denominator df.
  p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
  # Drop the "value" name carried over from f so a plain number is returned.
  unname(p)
}
#END TEST EXTRACT FUNCTION
# Boxplot of household income for each average rating.
# Rows with a missing rating are removed before plotting.
ggplot(
  data = drop_na(dFrameCoffee, avg_rating),
  mapping = aes(x = avg_rating, y = hhincome)
) +
  geom_boxplot() +
  labs(title = "hhincome vs avg_rating") +
  theme_bw()
# Same boxplot as above, split into one panel per county with facet_wrap()
# and given readable axis labels.
ggplot(
  data = drop_na(dFrameCoffee, avg_rating),
  mapping = aes(x = avg_rating, y = hhincome)
) +
  geom_boxplot() +
  facet_wrap(vars(county)) +
  labs(
    title = "hhincome vs avg_rating",
    x = "Average Yelp Rating",
    y = "Median Household Income"
  ) +
  theme_bw()
# Scatterplot of logged review count against household income, one panel per
# county (facet_wrap), with points colored by pct_white.
dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, hhincome)) +
  # size and alpha are fixed values, so they are set outside aes(). Inside
  # aes() they are treated as data mappings, which adds "size" and "alpha"
  # legends and rescales the values instead of using them as given.
  geom_point(aes(color = pct_white), size = 1, alpha = 0.5) +
  facet_wrap(~county) +
  theme_bw() +
  labs(
    title = "Scatterplot: Review Count vs Household Income",
    x = "Review Count (Log)",
    y = "Median Annual Household Income"
  ) +
  scale_color_gradient(low = "darkblue", high = "red")
# Model 1: simple linear regression of logged review count on household income.
lmod1 <- lm(formula = review_count_log ~ hhincome, data = dFrameCoffee)
summary(lmod1)
##
## Call:
## lm(formula = review_count_log ~ hhincome, data = dFrameCoffee)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6426 -0.5533 0.0529 0.6004 3.6950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.144e+00 1.256e-01 25.023 <2e-16 ***
## hhincome 3.719e-06 1.456e-06 2.554 0.0111 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.004 on 361 degrees of freedom
## Multiple R-squared: 0.01774, Adjusted R-squared: 0.01502
## F-statistic: 6.521 on 1 and 361 DF, p-value: 0.01107
# The fitted model is stored as a list under the hood.
typeof(lmod1)
## [1] "list"
# Overall F-test p-value of model 1 and the Pearson correlation between logged
# review count and household income, both computed over all rows.
pValuePlot1 <- lmp(lmod1)
rValuePlot1 <- cor(
  dFrameCoffee$review_count_log,
  dFrameCoffee$hhincome,
  use = "complete.obs"
)
typeof(rValuePlot1)
## [1] "double"
print(rValuePlot1)
## [1] 0.1332059
# Build the annotation text here, already rounded to three decimals.
# annotate() keeps the label it receives when the plot is created, so rounding
# the variables after this point would not change what the chart shows.
pLabel1 <- if (pValuePlot1 < 0.001) {
  "p < 0.001"
} else {
  sprintf("p = %.3f", pValuePlot1)
}
statsLabel1 <- paste0(sprintf("R = %.3f", rValuePlot1), ", ", pLabel1)
# Scatterplot colored by county with a linear fit drawn per color group; the
# annotation reports the statistics computed over all rows.
displayPlot1 <- dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, hhincome, color = county)) +
  geom_point() +
  geom_smooth(mapping = aes(color = county),
              method = "lm", se = FALSE, formula = y ~ x) +
  # hjust = 0 left-aligns the combined label starting at x = 0.5
  annotate("text", x = 0.5, y = 200000, hjust = 0, label = statsLabel1) +
  #facet_wrap(~county)+
  theme_bw() +
  labs(
    title = "Scatterplot between Logged Review Count and Neighborhood Characteristics",
    x = "Review Count Logged",
    y = "Median Annual Household Income")
#scale_color_gradient(low="darkblue", high="red")
######################################
# Disabled: an extra coercion of review_count_log to numeric. The column is
# already converted with as.numeric() right after the data is loaded.
#dFrameCoffee$review_count_log = as.numeric(dFrameCoffee$review_count_log)
######################################
# Model 2: simple linear regression of logged review count on pct_pov_log.
lmod2 <- lm(formula = review_count_log ~ pct_pov_log, data = dFrameCoffee)
summary(lmod2)
##
## Call:
## lm(formula = review_count_log ~ pct_pov_log, data = dFrameCoffee)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6207 -0.5490 0.0403 0.5993 3.6177
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.85709 0.16733 17.074 < 2e-16 ***
## pct_pov_log -0.26365 0.07255 -3.634 0.000319 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.995 on 361 degrees of freedom
## Multiple R-squared: 0.0353, Adjusted R-squared: 0.03262
## F-statistic: 13.21 on 1 and 361 DF, p-value: 0.0003192
# The fitted model is stored as a list under the hood.
typeof(lmod2)
## [1] "list"
# Overall F-test p-value of model 2 and the Pearson correlation between logged
# review count and pct_pov_log, both computed over all rows.
pValuePlot2 <- lmp(lmod2)
rValuePlot2 <- cor(
  dFrameCoffee$review_count_log,
  dFrameCoffee$pct_pov_log,
  use = "complete.obs"
)
typeof(rValuePlot2)
## [1] "double"
print(rValuePlot2)
## [1] -0.1878719
# Build the annotation text here, already rounded to three decimals.
# annotate() keeps the label it receives when the plot is created, so the
# numbers must be formatted before the plot is built. Very small p-values are
# shown as "p < 0.001" instead of rounding to zero.
pLabel2 <- if (pValuePlot2 < 0.001) {
  "p < 0.001"
} else {
  sprintf("p = %.3f", pValuePlot2)
}
statsLabel2 <- paste0(sprintf("R = %.3f", rValuePlot2), ", ", pLabel2)
# Scatterplot colored by county with a linear fit drawn per color group; the
# annotation reports the statistics computed over all rows.
displayPlot2 <- dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, pct_pov_log, color = county)) +
  geom_smooth(mapping = aes(color = county),
              method = "lm", se = FALSE, formula = y ~ x) +
  # hjust = 0 left-aligns the combined label starting at x = 0.5
  annotate("text", x = 0.5, y = 0, hjust = 0, label = statsLabel2) +
  geom_point() +
  #facet_wrap(~county)+
  theme_bw() +
  labs(
    title = "Scatterplot between Logged Review Count and Neighborhood Characteristics",
    x = "Review Count Logged",
    y = "Percent Residents Under Poverty")
#scale_color_gradient(low="darkblue", high="red")
######################################################################################
# Model 3: simple linear regression of logged review count on pct_white.
lmod3 <- lm(formula = review_count_log ~ pct_white, data = dFrameCoffee)
summary(lmod3)
##
## Call:
## lm(formula = review_count_log ~ pct_white, data = dFrameCoffee)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7484 -0.5491 0.0531 0.6274 3.5923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9113 0.1082 26.916 < 2e-16 ***
## pct_white 1.0655 0.1940 5.491 7.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9732 on 361 degrees of freedom
## Multiple R-squared: 0.07709, Adjusted R-squared: 0.07453
## F-statistic: 30.15 on 1 and 361 DF, p-value: 7.537e-08
# The fitted model is stored as a list under the hood.
typeof(lmod3)
## [1] "list"
# Overall F-test p-value of model 3 and the Pearson correlation between logged
# review count and pct_white, both computed over all rows.
pValuePlot3 <- lmp(lmod3)
rValuePlot3 <- cor(
  dFrameCoffee$review_count_log,
  dFrameCoffee$pct_white,
  use = "complete.obs"
)
typeof(rValuePlot3)
## [1] "double"
print(rValuePlot3)
## [1] 0.2776534
# Build the annotation text here, already rounded to three decimals.
# annotate() keeps the label it receives when the plot is created, so the
# numbers must be formatted before the plot is built. Very small p-values are
# shown as "p < 0.001" instead of rounding to zero.
pLabel3 <- if (pValuePlot3 < 0.001) {
  "p < 0.001"
} else {
  sprintf("p = %.3f", pValuePlot3)
}
statsLabel3 <- paste0(sprintf("R = %.3f", rValuePlot3), ", ", pLabel3)
# Scatterplot colored by county with a linear fit drawn per color group; the
# annotation reports the statistics computed over all rows.
displayPlot3 <- dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, pct_white, color = county)) +
  geom_smooth(mapping = aes(color = county),
              method = "lm", se = FALSE, formula = y ~ x) +
  # hjust = 0 left-aligns the combined label starting at x = 0.5
  annotate("text", x = 0.5, y = 0.99, hjust = 0, label = statsLabel3) +
  geom_point() +
  #facet_wrap(~county)+
  theme_bw() +
  labs(
    title = "Scatterplot between Logged Review Count and Neighborhood Characteristics",
    x = "Review Count Logged",
    y = "Percent White Resident")
##########################################
# Model 4: simple linear regression of logged review count on race.tot.
lmod4 <- lm(formula = review_count_log ~ race.tot, data = dFrameCoffee)
summary(lmod4)
##
## Call:
## lm(formula = review_count_log ~ race.tot, data = dFrameCoffee)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7670 -0.5257 0.0425 0.5489 3.7358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.468e+00 1.150e-01 30.156 <2e-16 ***
## race.tot -5.199e-06 1.602e-05 -0.324 0.746
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.013 on 361 degrees of freedom
## Multiple R-squared: 0.0002916, Adjusted R-squared: -0.002478
## F-statistic: 0.1053 on 1 and 361 DF, p-value: 0.7458
# The fitted model is stored as a list under the hood.
typeof(lmod4)
## [1] "list"
# Overall F-test p-value of model 4 and the Pearson correlation between logged
# review count and race.tot, both computed over all rows.
pValuePlot4 <- lmp(lmod4)
rValuePlot4 <- cor(
  dFrameCoffee$review_count_log,
  dFrameCoffee$race.tot,
  use = "complete.obs"
)
typeof(rValuePlot4)
## [1] "double"
print(rValuePlot4)
## [1] -0.01707485
#pValuePlot4 <- lmp(lmod4)
#rValuePlot4 <- as.character(rValuePlot4)
#typeof(rValuePlot4)
#print(rValuePlot4)
# Build the annotation text here, already rounded to three decimals.
# annotate() keeps the label it receives when the plot is created, so the
# numbers must be formatted before the plot is built. Very small p-values are
# shown as "p < 0.001" instead of rounding to zero.
pLabel4 <- if (pValuePlot4 < 0.001) {
  "p < 0.001"
} else {
  sprintf("p = %.3f", pValuePlot4)
}
statsLabel4 <- paste0(sprintf("R = %.3f", rValuePlot4), ", ", pLabel4)
# Scatterplot colored by county with a linear fit drawn per color group; the
# annotation reports the statistics computed over all rows.
displayPlot4 <- dFrameCoffee |>
  drop_na(review_count_log) |>
  ggplot(aes(review_count_log, race.tot, color = county)) +
  geom_smooth(mapping = aes(color = county),
              method = "lm", se = FALSE, formula = y ~ x) +
  #stat_cor(label.x = 1, label.y = 20000)+
  # hjust = 0 left-aligns the combined label starting at x = 0.5
  annotate("text", x = 0.5, y = 23000, hjust = 0, label = statsLabel4) +
  geom_point() +
  #facet_wrap(~county)+
  theme_bw() +
  labs(
    title = "Scatterplot between Logged Review Count and Neighborhood Characteristics",
    x = "Review Count Logged",
    y = "Total Population")
# Render the four annotated scatterplots one after another.
displayPlot1
displayPlot2
displayPlot3
displayPlot4
# Check the storage type of the model 1 statistics before rounding them.
typeof(rValuePlot1)
## [1] "double"
typeof(pValuePlot1)
## [1] "double"
# Round both statistics to three decimals and echo the results.
# NOTE: displayPlot1 has already been built by this point, so this rounding
# only changes the stored variables, not the labels inside that plot.
rValuePlot1 <- round(rValuePlot1, 3)
pValuePlot1 <- round(pValuePlot1, 3)
rValuePlot1
## [1] 0.133
pValuePlot1
## [1] 0.011
# Prepare trimmed copies of the four plots for the combined layouts:
# d1 and d2 lose their legend and x-axis label, d3 and d4 lose their title.
d1 <- displayPlot1 + theme(legend.position = "none") + labs(x = "")
d2 <- displayPlot2 + theme(legend.position = "none") + labs(x = "")
d3 <- displayPlot3 + labs(title = "")
d4 <- displayPlot4 + labs(title = "")
#d1 + plot_layout(widths = c(10, 1)) +
#d2 + plot_layout(widths = c(10, 1)) +
#d3 + plot_layout(widths = c(10, 1)) + plot_spacer() + plot_spacer()+
#d4 + plot_layout(widths = c(10,1), guides = 'collect')
# Two stacked pairs: plot 1 above plot 3, then plot 2 above plot 4.
d1 / d3
d2 / d4
# For the four-panel figure, also blank the titles of d1 and d2 and strip the
# legend and x-axis label from d3; d4 keeps its legend and x-axis label.
d1 <- d1 + labs(title = "")
d2 <- d2 + labs(title = "")
d3 <- d3 + theme(legend.position = "none") + labs(x = "")
#d4= d4 +labs(x="")
# Combine the four panels and add one shared title in a small font.
patchwork <- d1 + d2 + d3 + d4
patchwork + plot_annotation(
  title = "Scatterplot between logged review count & neighborhood characteristics \nUsing Yelp Data from 5 Counties around Atlanta",
  subtitle = "",
  caption = "",
  theme = theme(plot.title = element_text(size = 8))
)