This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dslabs)
You can also embed plots, for example:
data(murders)
head(murders)
## state abb region population total
## 1 Alabama AL South 4779736 135
## 2 Alaska AK West 710231 19
## 3 Arizona AZ West 6392017 232
## 4 Arkansas AR South 2915918 93
## 5 California CA West 37253956 1257
## 6 Colorado CO West 5029196 65
head(murders,10)
## state abb region population total
## 1 Alabama AL South 4779736 135
## 2 Alaska AK West 710231 19
## 3 Arizona AZ West 6392017 232
## 4 Arkansas AR South 2915918 93
## 5 California CA West 37253956 1257
## 6 Colorado CO West 5029196 65
## 7 Connecticut CT Northeast 3574097 97
## 8 Delaware DE South 897934 38
## 9 District of Columbia DC South 601723 99
## 10 Florida FL South 19687653 669
#for viewing last 6 rows of dataset
tail(murders)
## state abb region population total
## 46 Vermont VT Northeast 625741 2
## 47 Virginia VA South 8001024 250
## 48 Washington WA West 6724540 93
## 49 West Virginia WV South 1852994 27
## 50 Wisconsin WI North Central 5686986 97
## 51 Wyoming WY West 563626 5
#for structure
str(murders)
## 'data.frame': 51 obs. of 5 variables:
## $ state : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ abb : chr "AL" "AK" "AZ" "AR" ...
## $ region : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
## $ population: num 4779736 710231 6392017 2915918 37253956 ...
## $ total : num 135 19 232 93 1257 ...
colnames(murders)
## [1] "state" "abb" "region" "population" "total"
# q1: store the state names and abbreviations in separate vectors
state <- murders[["state"]]
print(state)
## [1] "Alabama" "Alaska" "Arizona"
## [4] "Arkansas" "California" "Colorado"
## [7] "Connecticut" "Delaware" "District of Columbia"
## [10] "Florida" "Georgia" "Hawaii"
## [13] "Idaho" "Illinois" "Indiana"
## [16] "Iowa" "Kansas" "Kentucky"
## [19] "Louisiana" "Maine" "Maryland"
## [22] "Massachusetts" "Michigan" "Minnesota"
## [25] "Mississippi" "Missouri" "Montana"
## [28] "Nebraska" "Nevada" "New Hampshire"
## [31] "New Jersey" "New Mexico" "New York"
## [34] "North Carolina" "North Dakota" "Ohio"
## [37] "Oklahoma" "Oregon" "Pennsylvania"
## [40] "Rhode Island" "South Carolina" "South Dakota"
## [43] "Tennessee" "Texas" "Utah"
## [46] "Vermont" "Virginia" "Washington"
## [49] "West Virginia" "Wisconsin" "Wyoming"
# q1 (continued): the state abbreviations as their own vector
abb <- murders[["abb"]]
print(abb)
## [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "DC" "FL" "GA" "HI" "ID" "IL" "IN"
## [16] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV" "NH"
## [31] "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "UT"
## [46] "VT" "VA" "WA" "WV" "WI" "WY"
# q2: what is the total population of the USA?
# (sum of the population column over every row of the dataset)
total_population_usa <- with(murders, sum(population))
print(total_population_usa)
## [1] 309864228
# q3: how many murders have been recorded in this dataset?
# (sum of the total column over every row)
total_murders <- with(murders, sum(total))
print(total_murders)
## [1] 9403
#q4: which state has the highest population?
# which.max() gives the row index of the largest population; that index is
# then used to look up the matching state name.
highest_popuation_state<-murders$state[which.max(murders$population)]
highest_popuation_state
## [1] "California"
# q5: which state has the highest number of murders?
# Look up the state name at the row where `total` is largest.
highest_no_of_murders <- with(murders, state[which.max(total)])
print(highest_no_of_murders)
## [1] "California"
# q6: which state has the lowest number of murders?
# Look up the state name at the row where `total` is smallest.
lowest_no_of_murders <- with(murders, state[which.min(total)])
print(lowest_no_of_murders)
## [1] "Vermont"
# q7: which state has the lowest population?
# Look up the state name at the row where `population` is smallest.
lowest_no_of_population <- with(murders, state[which.min(population)])
print(lowest_no_of_population)
## [1] "Wyoming"
#q8: compute the correlation between population and number of murders ###correlation: #The correlation coefficient measures the strength and direction of the linear relationship between two variables; it ranges from -1 to 1.# ###formula \[ r = \frac{n \sum{xy} - (\sum{x})(\sum{y})}{\sqrt{[n\sum{x^2} - (\sum{x})^2][n\sum{y^2} - (\sum{y})^2]}} \] # vector declaration
x<- c(murders$population)
x
## [1] 4779736 710231 6392017 2915918 37253956 5029196 3574097 897934
## [9] 601723 19687653 9920000 1360301 1567582 12830632 6483802 3046355
## [17] 2853118 4339367 4533372 1328361 5773552 6547629 9883640 5303925
## [25] 2967297 5988927 989415 1826341 2700551 1316470 8791894 2059179
## [33] 19378102 9535483 672591 11536504 3751351 3831074 12702379 1052567
## [41] 4625364 814180 6346105 25145561 2763885 625741 8001024 6724540
## [49] 1852994 5686986 563626
y<- c(murders$total)
y
## [1] 135 19 232 93 1257 65 97 38 99 669 376 7 12 364 142
## [16] 21 63 116 351 11 293 118 413 53 120 321 12 32 84 5
## [31] 246 67 517 286 4 310 111 36 457 16 207 8 219 805 22
## [46] 2 250 93 27 97 5
xy <- x * y
xy
## [1] 645264360 13494389 1482947944 271180374 46828222692 326897740
## [7] 346687409 34121492 59570577 13171039857 3729920000 9522107
## [13] 18810984 4670350048 920699884 63973455 179746434 503366572
## [19] 1591213572 14611971 1691650736 772620222 4081943320 281108025
## [25] 356075640 1922445567 11872980 58442912 226846284 6582350
## [31] 2162805924 137964993 10018478734 2727148138 2690364 3576316240
## [37] 416399961 137918664 5804987203 16841072 957450348 6513440
## [43] 1389796995 20242176605 60805470 1251482 2000256000 625382220
## [49] 50030838 551637642 2818130
#sum of xy
sum_xy<-sum(xy)
sum_xy
## [1] 135180900360
#n*(sum of xy)
n<-length(x)
n
## [1] 51
n*(sum_xy)
## [1] 6.894226e+12
#sum x
sum_x<-sum(x)
sum_x
## [1] 309864228
#sum y
sum_y<-sum(y)
sum_y
## [1] 9403
#sum of x * sum of y
sum_x*sum_y
## [1] 2.913653e+12
#numerator
num<-n*(sum_xy)-(sum_x)*(sum_y)
num
## [1] 3.980573e+12
#sq of x
x_sq<-x^2
x_sq
## [1] 2.284588e+13 5.044281e+11 4.085788e+13 8.502578e+12 1.387857e+15
## [6] 2.529281e+13 1.277417e+13 8.062855e+11 3.620706e+11 3.876037e+14
## [11] 9.840640e+13 1.850419e+12 2.457313e+12 1.646251e+14 4.203969e+13
## [16] 9.280279e+12 8.140282e+12 1.883011e+13 2.055146e+13 1.764543e+12
## [21] 3.333390e+13 4.287145e+13 9.768634e+13 2.813162e+13 8.804851e+12
## [26] 3.586725e+13 9.789420e+11 3.335521e+12 7.292976e+12 1.733093e+12
## [31] 7.729740e+13 4.240218e+12 3.755108e+14 9.092544e+13 4.523787e+11
## [36] 1.330909e+14 1.407263e+13 1.467713e+13 1.613504e+14 1.107897e+12
## [41] 2.139399e+13 6.628891e+11 4.027305e+13 6.322992e+14 7.639060e+12
## [46] 3.915518e+11 6.401639e+13 4.521944e+13 3.433587e+12 3.234181e+13
## [51] 3.176743e+11
#sum of sq of x
sum_x_sq<-sum(x_sq)
sum_x_sq
## [1] 4.236103e+15
#n*sum of sq of x
n*sum_x_sq
## [1] 2.160412e+17
#sq of sum x
sq_sum_x<-(sum_x)^2
sq_sum_x
## [1] 9.601584e+16
(n*sum_x_sq)-(sq_sum_x)
## [1] 1.200254e+17
y_sq<-y^2
y_sq
## [1] 18225 361 53824 8649 1580049 4225 9409 1444 9801
## [10] 447561 141376 49 144 132496 20164 441 3969 13456
## [19] 123201 121 85849 13924 170569 2809 14400 103041 144
## [28] 1024 7056 25 60516 4489 267289 81796 16 96100
## [37] 12321 1296 208849 256 42849 64 47961 648025 484
## [46] 4 62500 8649 729 9409 25
sum_y_sq<-sum(y_sq)
sum_y_sq
## [1] 4521433
#n*sum of sq of y
n*sum_y_sq
## [1] 230593083
sq_sum_y<-(sum_y)^2
sq_sum_y
## [1] 88416409
(n*sum_y_sq)-(sq_sum_y)
## [1] 142176674
deno<-((n*sum_x_sq)-(sq_sum_x))*((n*sum_y_sq)-(sq_sum_y))
deno
## [1] 1.706481e+25
sq_root_deno<-deno^0.5
sq_root_deno
## [1] 4.130958e+12
r<-num/sq_root_deno
r
## [1] 0.9635956
#OR
cor(x,y)
## [1] 0.9635956
#q9: fit the regression line of number of murders (y) on population (x) ## regression: #Regression is a statistical method used to model the relationship between a dependent variable and one or more independent variables.# formula: \[\frac{\sum y}{n} = a + b \frac{\sum x}{n}\] \[b = \frac{n\sum xy - (\sum x)(\sum y)}{n\sum x^2 - (\sum x)^2}\] \[a = \frac{\sum y}{n} - b \frac{\sum x}{n}\]
x<-c(murders$population)#x
y<-c(murders$total)#y
#sum of x (population)
sum_x<-sum(x)
sum_x
## [1] 309864228
n<-length(x)
n
## [1] 51
sum_y<-sum(y)
sum_y
## [1] 9403
# NOTE: n was already set to length(x) a few lines above and x has not been
# modified since, so this reassignment is redundant; it only re-prints n.
n<-length(x)
n
## [1] 51
xy<-x*y
xy
## [1] 645264360 13494389 1482947944 271180374 46828222692 326897740
## [7] 346687409 34121492 59570577 13171039857 3729920000 9522107
## [13] 18810984 4670350048 920699884 63973455 179746434 503366572
## [19] 1591213572 14611971 1691650736 772620222 4081943320 281108025
## [25] 356075640 1922445567 11872980 58442912 226846284 6582350
## [31] 2162805924 137964993 10018478734 2727148138 2690364 3576316240
## [37] 416399961 137918664 5804987203 16841072 957450348 6513440
## [43] 1389796995 20242176605 60805470 1251482 2000256000 625382220
## [49] 50030838 551637642 2818130
#sum of xy
sum_xy<-sum(xy)
sum_xy
## [1] 135180900360
#n*sum of xy
sum(xy)*n
## [1] 6.894226e+12
sum(x)*sum(y)
## [1] 2.913653e+12
num<-(sum(xy)*n)-(sum(x)*sum(y))
num
## [1] 3.980573e+12
#sq of x
sq_x<-x^2
sq_x
## [1] 2.284588e+13 5.044281e+11 4.085788e+13 8.502578e+12 1.387857e+15
## [6] 2.529281e+13 1.277417e+13 8.062855e+11 3.620706e+11 3.876037e+14
## [11] 9.840640e+13 1.850419e+12 2.457313e+12 1.646251e+14 4.203969e+13
## [16] 9.280279e+12 8.140282e+12 1.883011e+13 2.055146e+13 1.764543e+12
## [21] 3.333390e+13 4.287145e+13 9.768634e+13 2.813162e+13 8.804851e+12
## [26] 3.586725e+13 9.789420e+11 3.335521e+12 7.292976e+12 1.733093e+12
## [31] 7.729740e+13 4.240218e+12 3.755108e+14 9.092544e+13 4.523787e+11
## [36] 1.330909e+14 1.407263e+13 1.467713e+13 1.613504e+14 1.107897e+12
## [41] 2.139399e+13 6.628891e+11 4.027305e+13 6.322992e+14 7.639060e+12
## [46] 3.915518e+11 6.401639e+13 4.521944e+13 3.433587e+12 3.234181e+13
## [51] 3.176743e+11
#sum of sq of x
sum_sq_x<-sum(sq_x)
sum_sq_x
## [1] 4.236103e+15
#n*sum of x sq
n*sum_sq_x
## [1] 2.160412e+17
sq_sum_x<-(sum_x)^2
sq_sum_x
## [1] 9.601584e+16
deno<-(n*sum_sq_x)-(sq_sum_x)
deno
## [1] 1.200254e+17
#b
b<-num/deno
b
## [1] 3.316442e-05
mean_x<- sum_x/n
mean_x
## [1] 6075769
mean_y<-sum_y/n
#a
a<-(mean_y-b*mean_x)
a
## [1] -17.12682
# OR: fit the same line with lm() and read the coefficients off the model.
# y (murders) is the response and x (population) is the predictor.
x <- murders[["population"]]
y <- murders[["total"]]
model <- lm(y ~ x)
a <- coef(model)["(Intercept)"]  # intercept of the fitted line
b <- coef(model)["x"]            # slope with respect to x
cat("Regression Equation: y =", a, "+", b, "x\n")
## Regression Equation: y = -17.12682 + 3.316442e-05 x
# Population of the row whose state name is exactly "Washington".
washington_population <- with(murders, population[state == "Washington"])
print(washington_population)
## [1] 6724540
# Murder count for the row whose state name equals 'Atlas'.
# NOTE(review): the printed result is numeric(0), i.e. no entry of
# murders$state equals 'Atlas' - presumably a different state name was
# intended; confirm and correct the filter value.
no_of_murders_at_Atlas<-murders$total[murders$state=='Atlas']
no_of_murders_at_Atlas
## numeric(0)
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.