library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Dataset ----
# Load the TED talks table from CSV into a data frame.
# NOTE(review): absolute, machine-specific path -- the script only runs on a
# machine where this exact file exists.
file <- "C:/Users/JGARCIA/Desktop/ted_main/ted_main.csv"
ted_data <- data.frame(read.csv(file))

# Table dimensions: number of columns, then number of rows.
ncol(ted_data)
## [1] 17
nrow(ted_data)
## [1] 2550

# Distribution of view counts. The histogram is drawn on the log scale
# because the raw values are strongly right-skewed (max far above the median).
summary(ted_data$views)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50443 755793 1124524 1698297 1700760 47227110
hist(log(ted_data$views), col = "green", border = "blue")

# Distribution of comment counts, again with a log-scale histogram.
summary(ted_data$comments)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 63.0 118.0 191.6 221.8 6404.0
hist(log(ted_data$comments), col = "mediumspringgreen", border = "firebrick1")
# Linear Model ----
# Simple linear regression of comment count (response) on view count
# (predictor). The formula uses bare column names together with `data =` so
# the fitted terms are named `views` rather than `ted_data$views`, and the
# model can be used with predict(newdata = ...).
linearmodel <- lm(comments ~ views, data = ted_data)
summary(linearmodel)
##
## Call:
## lm(formula = comments ~ views, data = ted_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1168.1 -96.1 -44.2 30.8 6051.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.968e+01 5.730e+00 15.65 <2e-16 ***
## views 5.999e-05 1.897e-06 31.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 239.3 on 2548 degrees of freedom
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2816
## F-statistic: 1000 on 1 and 2548 DF, p-value: < 2.2e-16

# Scatter plot with the predictor (views) on the x-axis and the response
# (comments) on the y-axis, i.e. the same orientation as the model. abline()
# reads the intercept and slope from the fit, so the axes must match the
# formula for the drawn line to be the fitted regression line.
plot(comments ~ views, data = ted_data, col = "orange1")
abline(linearmodel)