# Group Exercise 1
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
starbucks = fread('./csv/starbucks_drink_nutrition.csv', fill = T)

1.1 [10 pts] Calculate the number of NA values in each column with any apply-family functions,and remove those records (rows) with NA values.

sapply(starbucks,function(x) sum(ifelse(is.na(x),1,0)))
##   Drinks Calories      Fat    Carbs    Fiber  Protein   Sodium 
##        0        1        2        0        1        2        1

#1.2 [10 pts] Remove those records with NA values,and caculate how many records remain.

starbucks_naclear = na.omit(starbucks)

#1.3 [10 pts] Please identify those drink with high calories.List top 5 highest calories beverages.

head(starbucks_naclear[order(starbucks_naclear$Calories,decreasing=TRUE)],5)

#1.4 [10 pts] Please lsit 5 beverages which are high in Portein but low in Calories.

head(starbucks_naclear[order(starbucks_naclear$Protein, -starbucks_naclear$Calories,decreasing = TRUE ), ],5)

###2 Plese load the given “laptop_price.csv” and consider the following questions.

laptop = fread('./csv/laptop_price.csv', fill = T)

laptop$Weight <-  as.numeric(strsplit(laptop$Weight,"kg") )
laptop$Ram <-   as.numeric(strsplit(laptop$Ram,"GB") )

#2.1. [15 pts] Please write an R function my_summary(x) that takes a numeric vector x and computes/returns summary statistics, including mean, standard deviation, max, min, and median. Then use your function to generate these statistics for Ram size, Weight and Price_euros in the dataset. DO NOT use built-in summary().

my_summary <- function(x){
  m <- mean(x)
  sd <- sd(x)
  max <- max(x)
  min <- min(x)
  med <- median(x)
  return (c(m,sd,max,min,med))
  
  my_summary(as.numeric(laptop$Ram))
  my_summary(as.numeric(laptop$Weight))
  my_summary(as.numeric(laptop$Price_euros))

}

#2.2. [20 pts] Please calculate the mean price of the same CPU grouped by CPU Name and save the values in a new variable “avg_price”. Then, list the CPU names and avg price of the top-3 popular CPU. Please do it in both R and SQL.

 laptop %>% 
  group_by(Cpu) %>% summarise(avg_price = mean(Price_euros)) %>% arrange(desc(avg_price))
sqldf::sqldf("select Cpu,avg(Price_euros) as avg_price from laptop group by Cpu order by avg_price desc limit 3") 

#2.3 [15 pts] Visualize the “Weight”, “avg_price”, “Weight by”avg_ price" by using any plots. Does the avg_price appear to be normally distributed? Do you think that the weight of laptop is as associated with the avg vg _price? Use any descriptive statistics techniques and/or statistical methods (e.g. hypothesis testing) to justify your findings.

library(ggplot2)

laptop2 = laptop %>% group_by(Cpu) %>% mutate(avg_price = mean(Price_euros))


ggplot(laptop2,aes(laptop2$Weight,laptop2$avg_price))+geom_point()

chisq.test(laptop2$Weight,laptop2$avg_price)
## Warning in chisq.test(laptop2$Weight, laptop2$avg_price): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  laptop2$Weight and laptop2$avg_price
## X-squared = 27834, df = 19380, p-value < 2.2e-16
#假說h0 weight , avg_price 是獨立的
#p value < 0.05 拒絕h0
#因此weight , avg_price有相關

#2.4. [10 pts] Fit a general linear model and interpret the result. Is there any association between the Weight and avg _price. Do you think that there is any direct causal connection between the Weight and avg_price?

f <- lm(formula = laptop2$Weight ~ laptop2$avg_price,data = laptop2) 
summary(f)
## 
## Call:
## lm(formula = laptop2$Weight ~ laptop2$avg_price, data = laptop2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.44886 -0.45846  0.07165  0.34599  2.34599 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.591e+00  3.888e-02   40.92   <2e-16 ***
## laptop2$avg_price 3.987e-04  3.095e-05   12.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.627 on 1301 degrees of freedom
## Multiple R-squared:  0.1131, Adjusted R-squared:  0.1124 
## F-statistic: 165.9 on 1 and 1301 DF,  p-value: < 2.2e-16
#Weight與Avg_price的解釋力 R-Sqaured = 11% 
#Weight與Avg_price有相關但解釋力僅11%,沒辦法只單靠avg_price解釋Weight,因此還存在其他變數可解釋