# Group Exercise 1
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
starbucks = fread('./csv/starbucks_drink_nutrition.csv', fill = T)
sapply(starbucks,function(x) sum(ifelse(is.na(x),1,0)))
## Drinks Calories Fat Carbs Fiber Protein Sodium
## 0 1 2 0 1 2 1
#1.2 [10 pts] Remove those records with NA values,and caculate how many records remain.
starbucks_naclear = na.omit(starbucks)
#1.3 [10 pts] Please identify those drink with high calories.List top 5 highest calories beverages.
head(starbucks_naclear[order(starbucks_naclear$Calories,decreasing=TRUE)],5)
#1.4 [10 pts] Please lsit 5 beverages which are high in Portein but low in Calories.
head(starbucks_naclear[order(starbucks_naclear$Protein, -starbucks_naclear$Calories,decreasing = TRUE ), ],5)
###2 Plese load the given “laptop_price.csv” and consider the following questions.
laptop = fread('./csv/laptop_price.csv', fill = T)
laptop$Weight <- as.numeric(strsplit(laptop$Weight,"kg") )
laptop$Ram <- as.numeric(strsplit(laptop$Ram,"GB") )
#2.1. [15 pts] Please write an R function my_summary(x) that takes a numeric vector x and computes/returns summary statistics, including mean, standard deviation, max, min, and median. Then use your function to generate these statistics for Ram size, Weight and Price_euros in the dataset. DO NOT use built-in summary().
my_summary <- function(x){
m <- mean(x)
sd <- sd(x)
max <- max(x)
min <- min(x)
med <- median(x)
return (c(m,sd,max,min,med))
my_summary(as.numeric(laptop$Ram))
my_summary(as.numeric(laptop$Weight))
my_summary(as.numeric(laptop$Price_euros))
}
#2.2. [20 pts] Please calculate the mean price of the same CPU grouped by CPU Name and save the values in a new variable “avg_price”. Then, list the CPU names and avg price of the top-3 popular CPU. Please do it in both R and SQL.
laptop %>%
group_by(Cpu) %>% summarise(avg_price = mean(Price_euros)) %>% arrange(desc(avg_price))
sqldf::sqldf("select Cpu,avg(Price_euros) as avg_price from laptop group by Cpu order by avg_price desc limit 3")
#2.3 [15 pts] Visualize the “Weight”, “avg_price”, “Weight by”avg_ price" by using any plots. Does the avg_price appear to be normally distributed? Do you think that the weight of laptop is as associated with the avg vg _price? Use any descriptive statistics techniques and/or statistical methods (e.g. hypothesis testing) to justify your findings.
library(ggplot2)
laptop2 = laptop %>% group_by(Cpu) %>% mutate(avg_price = mean(Price_euros))
ggplot(laptop2,aes(laptop2$Weight,laptop2$avg_price))+geom_point()
chisq.test(laptop2$Weight,laptop2$avg_price)
## Warning in chisq.test(laptop2$Weight, laptop2$avg_price): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: laptop2$Weight and laptop2$avg_price
## X-squared = 27834, df = 19380, p-value < 2.2e-16
#假說h0 weight , avg_price 是獨立的
#p value < 0.05 拒絕h0
#因此weight , avg_price有相關
#2.4. [10 pts] Fit a general linear model and interpret the result. Is there any association between the Weight and avg _price. Do you think that there is any direct causal connection between the Weight and avg_price?
f <- lm(formula = laptop2$Weight ~ laptop2$avg_price,data = laptop2)
summary(f)
##
## Call:
## lm(formula = laptop2$Weight ~ laptop2$avg_price, data = laptop2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.44886 -0.45846 0.07165 0.34599 2.34599
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.591e+00 3.888e-02 40.92 <2e-16 ***
## laptop2$avg_price 3.987e-04 3.095e-05 12.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.627 on 1301 degrees of freedom
## Multiple R-squared: 0.1131, Adjusted R-squared: 0.1124
## F-statistic: 165.9 on 1 and 1301 DF, p-value: < 2.2e-16
#Weight與Avg_price的解釋力 R-Sqaured = 11%
#Weight與Avg_price有相關但解釋力僅11%,沒辦法只單靠avg_price解釋Weight,因此還存在其他變數可解釋