library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Dataset:

# Load the TED talks dataset. read.csv() already returns a data.frame, so no
# further conversion step is needed.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this file exists. Consider a project-relative path.
file <- "C:/Users/JGARCIA/Desktop/ted_main/ted_main.csv"
ted_data <- read.csv(file)

# Size of the dataset: number of columns, then number of rows.
ncol(ted_data)
## [1] 17
nrow(ted_data)
## [1] 2550

# Five-number summary of views, followed by a histogram on the log scale
# (the raw values span several orders of magnitude, see Min. vs Max. below).
summary(ted_data$views)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    50443   755793  1124524  1698297  1700760 47227110
hist(log(ted_data$views), col = "green", border = "blue")

# Same summary and log-scale histogram for comments.
summary(ted_data$comments)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0    63.0   118.0   191.6   221.8  6404.0
hist(log(ted_data$comments), col = "mediumspringgreen", border = "firebrick1")

Linear Model:

# Simple linear regression: comments (response) explained by views (predictor).
linearmodel <- lm(ted_data$comments ~ ted_data$views)
summary(linearmodel)
## 
## Call:
## lm(formula = ted_data$comments ~ ted_data$views)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1168.1   -96.1   -44.2    30.8  6051.9 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.968e+01  5.730e+00   15.65   <2e-16 ***
## ted_data$views 5.999e-05  1.897e-06   31.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 239.3 on 2548 degrees of freedom
## Multiple R-squared:  0.2819, Adjusted R-squared:  0.2816 
## F-statistic:  1000 on 1 and 2548 DF,  p-value: < 2.2e-16

# Scatter plot with the predictor (views) on the x-axis and the response
# (comments) on the y-axis. The axes must match the model formula, otherwise
# abline() draws the fitted intercept/slope in the wrong coordinate system.
plot(ted_data$views, ted_data$comments, col = "orange1")
abline(linearmodel)