To load the data set, we first have to install the “dslabs” package (for example with install.packages("dslabs")); then we run the following code:
# Attach dslabs and load its `murders` data frame into the workspace.
library(dslabs)
## Warning: package 'dslabs' was built under R version 4.2.3
data("murders")
# Full state names, taken straight from the `state` column.
state <- murders[["state"]]
state
## [1] "Alabama" "Alaska" "Arizona"
## [4] "Arkansas" "California" "Colorado"
## [7] "Connecticut" "Delaware" "District of Columbia"
## [10] "Florida" "Georgia" "Hawaii"
## [13] "Idaho" "Illinois" "Indiana"
## [16] "Iowa" "Kansas" "Kentucky"
## [19] "Louisiana" "Maine" "Maryland"
## [22] "Massachusetts" "Michigan" "Minnesota"
## [25] "Mississippi" "Missouri" "Montana"
## [28] "Nebraska" "Nevada" "New Hampshire"
## [31] "New Jersey" "New Mexico" "New York"
## [34] "North Carolina" "North Dakota" "Ohio"
## [37] "Oklahoma" "Oregon" "Pennsylvania"
## [40] "Rhode Island" "South Carolina" "South Dakota"
## [43] "Tennessee" "Texas" "Utah"
## [46] "Vermont" "Virginia" "Washington"
## [49] "West Virginia" "Wisconsin" "Wyoming"
# State abbreviations, taken from the `abb` column.
abb <- murders[["abb"]]
abb
## [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "DC" "FL" "GA" "HI" "ID" "IL" "IN"
## [16] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV" "NH"
## [31] "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "UT"
## [46] "VT" "VA" "WA" "WV" "WI" "WY"
# Totals over every row of `murders`.
total_population_usa <- sum(murders[["population"]])
total_population_usa
## [1] 309864228
total_murders <- sum(murders[["total"]])
total_murders
## [1] 9403
# States at the extremes: which.max()/which.min() give the row position of the
# largest/smallest value, which is then used to look up the state name.
# (The spelling of `highest_popuation_state` is kept unchanged on purpose.)
highest_popuation_state <- with(murders, state[which.max(population)])
highest_popuation_state
## [1] "California"
highest_no_of_murders <- with(murders, state[which.max(total)])
highest_no_of_murders
## [1] "California"
lowest_no_of_murders <- with(murders, state[which.min(total)])
lowest_no_of_murders
## [1] "Vermont"
lowest_no_of_population <- with(murders, state[which.min(population)])
lowest_no_of_population
## [1] "Wyoming"
# Single-state lookups via a logical match on the state name.
washington_population <- with(murders, population[state == "Washington"])
cat(washington_population)
## 6724540
no_of_murders_at_Alaska <- with(murders, total[state == "Alaska"])
no_of_murders_at_Alaska
## [1] 19
Correlation is a statistical measure that describes the degree to which two or more variables are related or associated with each other. It quantifies the strength and direction of the linear relationship between these variables.
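\[ r = \frac{\sum_{i}(x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i}(x_i - \bar{x})^2 \cdot \sum_{i}(y_i - \bar{y})^2}} = \frac{n\sum x_i y_i - \sum x_i \sum y_i}{\sqrt{\left[n\sum x_i^2 - \left(\sum x_i\right)^2\right]\left[n\sum y_i^2 - \left(\sum y_i\right)^2\right]}} \]
The code below evaluates the equivalent raw-sum form on the right-hand side step by step.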
# Pearson correlation between population (x) and murder totals (y), built up
# one term at a time from raw sums:
#   r = (n*sum(xy) - sum(x)*sum(y)) /
#       sqrt((n*sum(x^2) - sum(x)^2) * (n*sum(y^2) - sum(y)^2))
x <- murders$population
x
## [1] 4779736 710231 6392017 2915918 37253956 5029196 3574097 897934
## [9] 601723 19687653 9920000 1360301 1567582 12830632 6483802 3046355
## [17] 2853118 4339367 4533372 1328361 5773552 6547629 9883640 5303925
## [25] 2967297 5988927 989415 1826341 2700551 1316470 8791894 2059179
## [33] 19378102 9535483 672591 11536504 3751351 3831074 12702379 1052567
## [41] 4625364 814180 6346105 25145561 2763885 625741 8001024 6724540
## [49] 1852994 5686986 563626
y <- murders$total
y
## [1] 135 19 232 93 1257 65 97 38 99 669 376 7 12 364 142
## [16] 21 63 116 351 11 293 118 413 53 120 321 12 32 84 5
## [31] 246 67 517 286 4 310 111 36 457 16 207 8 219 805 22
## [46] 2 250 93 27 97 5
# Numerator: n * sum(xy) - sum(x) * sum(y)
xy <- x * y
xy
## [1] 645264360 13494389 1482947944 271180374 46828222692 326897740
## [7] 346687409 34121492 59570577 13171039857 3729920000 9522107
## [13] 18810984 4670350048 920699884 63973455 179746434 503366572
## [19] 1591213572 14611971 1691650736 772620222 4081943320 281108025
## [25] 356075640 1922445567 11872980 58442912 226846284 6582350
## [31] 2162805924 137964993 10018478734 2727148138 2690364 3576316240
## [37] 416399961 137918664 5804987203 16841072 957450348 6513440
## [43] 1389796995 20242176605 60805470 1251482 2000256000 625382220
## [49] 50030838 551637642 2818130
sum_xy <- sum(xy)
sum_xy
## [1] 135180900360
n <- length(x)
n
## [1] 51
n * sum_xy
## [1] 6.894226e+12
sum_x <- sum(x)
sum_x
## [1] 309864228
sum_y <- sum(y)
sum_y
## [1] 9403
sum_x * sum_y
## [1] 2.913653e+12
num <- n * sum_xy - sum_x * sum_y
num
## [1] 3.980573e+12
# Denominator, x part: n * sum(x^2) - sum(x)^2
x_sq <- x^2
x_sq
## [1] 2.284588e+13 5.044281e+11 4.085788e+13 8.502578e+12 1.387857e+15
## [6] 2.529281e+13 1.277417e+13 8.062855e+11 3.620706e+11 3.876037e+14
## [11] 9.840640e+13 1.850419e+12 2.457313e+12 1.646251e+14 4.203969e+13
## [16] 9.280279e+12 8.140282e+12 1.883011e+13 2.055146e+13 1.764543e+12
## [21] 3.333390e+13 4.287145e+13 9.768634e+13 2.813162e+13 8.804851e+12
## [26] 3.586725e+13 9.789420e+11 3.335521e+12 7.292976e+12 1.733093e+12
## [31] 7.729740e+13 4.240218e+12 3.755108e+14 9.092544e+13 4.523787e+11
## [36] 1.330909e+14 1.407263e+13 1.467713e+13 1.613504e+14 1.107897e+12
## [41] 2.139399e+13 6.628891e+11 4.027305e+13 6.322992e+14 7.639060e+12
## [46] 3.915518e+11 6.401639e+13 4.521944e+13 3.433587e+12 3.234181e+13
## [51] 3.176743e+11
sum_x_sq <- sum(x_sq)
sum_x_sq
## [1] 4.236103e+15
n * sum_x_sq
## [1] 2.160412e+17
sq_sum_x <- sum_x^2
sq_sum_x
## [1] 9.601584e+16
n * sum_x_sq - sq_sum_x
## [1] 1.200254e+17
# Denominator, y part: n * sum(y^2) - sum(y)^2
y_sq <- y^2
y_sq
## [1] 18225 361 53824 8649 1580049 4225 9409 1444 9801
## [10] 447561 141376 49 144 132496 20164 441 3969 13456
## [19] 123201 121 85849 13924 170569 2809 14400 103041 144
## [28] 1024 7056 25 60516 4489 267289 81796 16 96100
## [37] 12321 1296 208849 256 42849 64 47961 648025 484
## [46] 4 62500 8649 729 9409 25
sum_y_sq <- sum(y_sq)
sum_y_sq
## [1] 4521433
n * sum_y_sq
## [1] 230593083
sq_sum_y <- sum_y^2
sq_sum_y
## [1] 88416409
n * sum_y_sq - sq_sum_y
## [1] 142176674
# Combine both parts, take the square root, and divide.
deno <- (n * sum_x_sq - sq_sum_x) * (n * sum_y_sq - sq_sum_y)
deno
## [1] 1.706481e+25
sq_root_deno <- deno^0.5
sq_root_deno
## [1] 4.130958e+12
r <- num / sq_root_deno
r
## [1] 0.9635956
# Same quantity from the built-in function, for comparison.
cor(x, y)
## [1] 0.9635956
The correlation between population and the number of murders in the murders dataset is 0.96. This indicates a strong positive linear relationship between the two variables: as the population of a state increases, the number of murders in that state also tends to increase.
A regression line, also known as a "best-fit line" or a regression equation, is a straight line that represents the relationship between two variables in a statistical analysis, typically in the context of linear regression. It is used to model the relationship between a dependent variable (also known as the response variable) and one or more independent variables (also known as predictor variables or features). The primary goal of a regression line is to provide an estimate of the dependent variable based on the values of the independent variable(s).
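\[ Y = a + bX + \epsilon \]
The least-squares estimates of the slope \(b\) and the intercept \(a\) are
\[ b = \frac{n\sum x_i y_i - \sum x_i \sum y_i}{n\sum x_i^2 - \left(\sum x_i\right)^2}, \qquad a = \bar{y} - b\,\bar{x} \]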
# Simple linear regression of murder totals (y) on population (x), by hand:
#   b = (n*sum(xy) - sum(x)*sum(y)) / (n*sum(x^2) - sum(x)^2)
#   a = mean(y) - b * mean(x)
x <- murders$population # x
y <- murders$total # y
sum_x <- sum(x)
sum_x
## [1] 309864228
n <- length(x)
n
## [1] 51
sum_y <- sum(y)
sum_y
## [1] 9403
xy <- x * y
xy
## [1] 645264360 13494389 1482947944 271180374 46828222692 326897740
## [7] 346687409 34121492 59570577 13171039857 3729920000 9522107
## [13] 18810984 4670350048 920699884 63973455 179746434 503366572
## [19] 1591213572 14611971 1691650736 772620222 4081943320 281108025
## [25] 356075640 1922445567 11872980 58442912 226846284 6582350
## [31] 2162805924 137964993 10018478734 2727148138 2690364 3576316240
## [37] 416399961 137918664 5804987203 16841072 957450348 6513440
## [43] 1389796995 20242176605 60805470 1251482 2000256000 625382220
## [49] 50030838 551637642 2818130
sum_xy <- sum(xy)
sum_xy
## [1] 135180900360
# Slope numerator, reusing the sums computed above instead of recomputing them.
sum_xy * n
## [1] 6.894226e+12
sum_x * sum_y
## [1] 2.913653e+12
num <- sum_xy * n - sum_x * sum_y
num
## [1] 3.980573e+12
# Slope denominator: n * sum(x^2) - sum(x)^2
sq_x <- x^2
sq_x
## [1] 2.284588e+13 5.044281e+11 4.085788e+13 8.502578e+12 1.387857e+15
## [6] 2.529281e+13 1.277417e+13 8.062855e+11 3.620706e+11 3.876037e+14
## [11] 9.840640e+13 1.850419e+12 2.457313e+12 1.646251e+14 4.203969e+13
## [16] 9.280279e+12 8.140282e+12 1.883011e+13 2.055146e+13 1.764543e+12
## [21] 3.333390e+13 4.287145e+13 9.768634e+13 2.813162e+13 8.804851e+12
## [26] 3.586725e+13 9.789420e+11 3.335521e+12 7.292976e+12 1.733093e+12
## [31] 7.729740e+13 4.240218e+12 3.755108e+14 9.092544e+13 4.523787e+11
## [36] 1.330909e+14 1.407263e+13 1.467713e+13 1.613504e+14 1.107897e+12
## [41] 2.139399e+13 6.628891e+11 4.027305e+13 6.322992e+14 7.639060e+12
## [46] 3.915518e+11 6.401639e+13 4.521944e+13 3.433587e+12 3.234181e+13
## [51] 3.176743e+11
sum_sq_x <- sum(sq_x)
sum_sq_x
## [1] 4.236103e+15
n * sum_sq_x
## [1] 2.160412e+17
sq_sum_x <- sum_x^2
sq_sum_x
## [1] 9.601584e+16
deno <- n * sum_sq_x - sq_sum_x
deno
## [1] 1.200254e+17
b <- num / deno
b
## [1] 3.316442e-05
# Intercept from the two means: a = mean(y) - b * mean(x)
mean_x <- sum_x / n
mean_x #or
## [1] 6075769
mean(x)
## [1] 6075769
mean_y <- sum_y / n
a <- mean_y - b * mean_x
a
## [1] -17.12682
# Fitted values get their own name so the observed totals in `y` are not
# overwritten by predictions.
y_hat <- a + b * x
cat(y_hat)
## 141.3904 6.42758 194.8607 79.57791 1218.379 149.6636 101.406 12.65264 2.828975 635.8028 311.8642 27.98678 34.86113 408.3937 197.9047 83.90378 77.49519 126.7858 133.2198 26.9275 174.3497 200.0215 310.6584 158.7748 81.28187 181.4925 15.68656 43.44272 72.43539 26.53315 274.4513 51.16466 625.5367 299.112 5.179271 365.4747 107.2846 109.9285 404.1402 17.78096 136.2707 9.874988 193.3381 816.8112 74.53583 3.625518 248.2225 205.8887 44.32665 171.4788 1.56551
# Fit y = a + b * x with lm() and print the intercept and slope.
x <- murders[["population"]]
y <- murders[["total"]]
model <- lm(y ~ x)
# Select each coefficient by name; single brackets keep the element's name.
a <- coef(model)["(Intercept)"]
b <- coef(model)["x"]
cat("Regression Equation: y =", a, "+", b, "x\n")
## Regression Equation: y = -17.12682 + 3.316442e-05 x