R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dslabs)

Including Plots

You can also embed plots, for example:

# Load the murders data set into the workspace.
data("murders")

for viewing first 6 rows of murders dataset

# Show the first six rows (the default row count, spelled out).
head(murders, n = 6)
##        state abb region population total
## 1    Alabama  AL  South    4779736   135
## 2     Alaska  AK   West     710231    19
## 3    Arizona  AZ   West    6392017   232
## 4   Arkansas  AR  South    2915918    93
## 5 California  CA   West   37253956  1257
## 6   Colorado  CO   West    5029196    65

for viewing first 10 rows of murders dataset

# Show the first ten rows.
head(murders, n = 10)
##                   state abb    region population total
## 1               Alabama  AL     South    4779736   135
## 2                Alaska  AK      West     710231    19
## 3               Arizona  AZ      West    6392017   232
## 4              Arkansas  AR     South    2915918    93
## 5            California  CA      West   37253956  1257
## 6              Colorado  CO      West    5029196    65
## 7           Connecticut  CT Northeast    3574097    97
## 8              Delaware  DE     South     897934    38
## 9  District of Columbia  DC     South     601723    99
## 10              Florida  FL     South   19687653   669

#for viewing last 6 rows of dataset

# Show the last six rows (the default row count, spelled out).
tail(murders, n = 6)
##            state abb        region population total
## 46       Vermont  VT     Northeast     625741     2
## 47      Virginia  VA         South    8001024   250
## 48    Washington  WA          West    6724540    93
## 49 West Virginia  WV         South    1852994    27
## 50     Wisconsin  WI North Central    5686986    97
## 51       Wyoming  WY          West     563626     5

#for structure

# Compact overview of the column types and the first few values.
str(object = murders)
## 'data.frame':    51 obs. of  5 variables:
##  $ state     : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ abb       : chr  "AL" "AK" "AZ" "AR" ...
##  $ region    : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
##  $ population: num  4779736 710231 6392017 2915918 37253956 ...
##  $ total     : num  135 19 232 93 1257 ...

for showing column names

# Column names of the data frame.
names(murders)
## [1] "state"      "abb"        "region"     "population" "total"

#q1: store state and abbreviation names in separate vectors

# Copy the state names into their own character vector and display it.
state <- murders[["state"]]
state
##  [1] "Alabama"              "Alaska"               "Arizona"             
##  [4] "Arkansas"             "California"           "Colorado"            
##  [7] "Connecticut"          "Delaware"             "District of Columbia"
## [10] "Florida"              "Georgia"              "Hawaii"              
## [13] "Idaho"                "Illinois"             "Indiana"             
## [16] "Iowa"                 "Kansas"               "Kentucky"            
## [19] "Louisiana"            "Maine"                "Maryland"            
## [22] "Massachusetts"        "Michigan"             "Minnesota"           
## [25] "Mississippi"          "Missouri"             "Montana"             
## [28] "Nebraska"             "Nevada"               "New Hampshire"       
## [31] "New Jersey"           "New Mexico"           "New York"            
## [34] "North Carolina"       "North Dakota"         "Ohio"                
## [37] "Oklahoma"             "Oregon"               "Pennsylvania"        
## [40] "Rhode Island"         "South Carolina"       "South Dakota"        
## [43] "Tennessee"            "Texas"                "Utah"                
## [46] "Vermont"              "Virginia"             "Washington"          
## [49] "West Virginia"        "Wisconsin"            "Wyoming"
# Copy the state abbreviations into their own character vector and display it.
abb <- murders[["abb"]]
abb
##  [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "DC" "FL" "GA" "HI" "ID" "IL" "IN"
## [16] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV" "NH"
## [31] "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "UT"
## [46] "VT" "VA" "WA" "WV" "WI" "WY"

#q2: what is the total population of USA?

# Add up the population column over all rows.
total_population_usa <- sum(murders[["population"]])
total_population_usa
## [1] 309864228

#q3: how many murders have been committed according to this dataset?

# Add up the murder counts over all rows.
total_murders <- sum(murders[["total"]])
total_murders
## [1] 9403

#q4: which state has highest population?

# State name on the row where population is largest.
highest_popuation_state <- with(murders, state[which.max(population)])
highest_popuation_state
## [1] "California"

#q5:which state has highest no of murders?

# State name on the row where the murder count is largest.
highest_no_of_murders <- with(murders, state[which.max(total)])
highest_no_of_murders
## [1] "California"

#q6: which state has lowest no of murders?

# State name on the row where the murder count is smallest.
lowest_no_of_murders <- with(murders, state[which.min(total)])
lowest_no_of_murders
## [1] "Vermont"

#q7: which state has lowest population?

# State name on the row where population is smallest.
lowest_no_of_population <- with(murders, state[which.min(population)])
lowest_no_of_population
## [1] "Wyoming"

#q8: compute the correlation between population and no of murders

###correlation:
The correlation coefficient is a statistical measure of the strength and direction of the linear relationship between two variables (here x = population and y = no of murders). It always lies between -1 and 1.

###formula
\[ r = \frac{n \sum{xy} - (\sum{x})(\sum{y})}{\sqrt{[n\sum{x^2} - (\sum{x})^2][n\sum{y^2} - (\sum{y})^2]}} \]

#vector declaration

# x: population of each state.
x <- murders[["population"]]
x
##  [1]  4779736   710231  6392017  2915918 37253956  5029196  3574097   897934
##  [9]   601723 19687653  9920000  1360301  1567582 12830632  6483802  3046355
## [17]  2853118  4339367  4533372  1328361  5773552  6547629  9883640  5303925
## [25]  2967297  5988927   989415  1826341  2700551  1316470  8791894  2059179
## [33] 19378102  9535483   672591 11536504  3751351  3831074 12702379  1052567
## [41]  4625364   814180  6346105 25145561  2763885   625741  8001024  6724540
## [49]  1852994  5686986   563626
# y: number of murders in each state.
y <- murders[["total"]]
y
##  [1]  135   19  232   93 1257   65   97   38   99  669  376    7   12  364  142
## [16]   21   63  116  351   11  293  118  413   53  120  321   12   32   84    5
## [31]  246   67  517  286    4  310  111   36  457   16  207    8  219  805   22
## [46]    2  250   93   27   97    5

xy

# Elementwise product of the two vectors.
xy <- y * x
xy
##  [1]   645264360    13494389  1482947944   271180374 46828222692   326897740
##  [7]   346687409    34121492    59570577 13171039857  3729920000     9522107
## [13]    18810984  4670350048   920699884    63973455   179746434   503366572
## [19]  1591213572    14611971  1691650736   772620222  4081943320   281108025
## [25]   356075640  1922445567    11872980    58442912   226846284     6582350
## [31]  2162805924   137964993 10018478734  2727148138     2690364  3576316240
## [37]   416399961   137918664  5804987203    16841072   957450348     6513440
## [43]  1389796995 20242176605    60805470     1251482  2000256000   625382220
## [49]    50030838   551637642     2818130

#sum of xy

# Total of the elementwise products.
sum_xy <- sum(xy)
sum_xy
## [1] 135180900360

#n*(sum of xy)

# Number of observations.
n <- length(x)
n
## [1] 51
# First term of the numerator.
n * sum_xy
## [1] 6.894226e+12

#sum x

# Total of x.
sum_x <- sum(x)
sum_x
## [1] 309864228

#sum y

# Total of y.
sum_y <- sum(y)
sum_y
## [1] 9403

#sum of x * sum of y

# Second term of the numerator.
sum_x * sum_y
## [1] 2.913653e+12

#numerator

# Numerator of r: n * sum(xy) - sum(x) * sum(y).
num <- n * sum_xy - sum_x * sum_y
num
## [1] 3.980573e+12

#sq of x

# Square every element of x.
x_sq <- x^2
x_sq
##  [1] 2.284588e+13 5.044281e+11 4.085788e+13 8.502578e+12 1.387857e+15
##  [6] 2.529281e+13 1.277417e+13 8.062855e+11 3.620706e+11 3.876037e+14
## [11] 9.840640e+13 1.850419e+12 2.457313e+12 1.646251e+14 4.203969e+13
## [16] 9.280279e+12 8.140282e+12 1.883011e+13 2.055146e+13 1.764543e+12
## [21] 3.333390e+13 4.287145e+13 9.768634e+13 2.813162e+13 8.804851e+12
## [26] 3.586725e+13 9.789420e+11 3.335521e+12 7.292976e+12 1.733093e+12
## [31] 7.729740e+13 4.240218e+12 3.755108e+14 9.092544e+13 4.523787e+11
## [36] 1.330909e+14 1.407263e+13 1.467713e+13 1.613504e+14 1.107897e+12
## [41] 2.139399e+13 6.628891e+11 4.027305e+13 6.322992e+14 7.639060e+12
## [46] 3.915518e+11 6.401639e+13 4.521944e+13 3.433587e+12 3.234181e+13
## [51] 3.176743e+11

#sum of sq of x

# Total of the squared x values.
sum_x_sq <- sum(x_sq)
sum_x_sq
## [1] 4.236103e+15

#n*sum of sq of x

# n times the sum of squared x values.
n * sum_x_sq
## [1] 2.160412e+17

#sq of sum x

# Square of the total of x.
sq_sum_x <- sum_x^2
sq_sum_x
## [1] 9.601584e+16
# x factor of the denominator: n * sum(x^2) - (sum(x))^2.
n * sum_x_sq - sq_sum_x
## [1] 1.200254e+17

sq of y

# Square every element of y.
y_sq <- y^2
y_sq
##  [1]   18225     361   53824    8649 1580049    4225    9409    1444    9801
## [10]  447561  141376      49     144  132496   20164     441    3969   13456
## [19]  123201     121   85849   13924  170569    2809   14400  103041     144
## [28]    1024    7056      25   60516    4489  267289   81796      16   96100
## [37]   12321    1296  208849     256   42849      64   47961  648025     484
## [46]       4   62500    8649     729    9409      25

sum of sq of y

# Total of the squared y values.
sum_y_sq <- sum(y_sq)
sum_y_sq
## [1] 4521433

#n*sum of sq of y

# n times the sum of squared y values.
n * sum_y_sq
## [1] 230593083

sq of sum y

# Square of the total of y.
sq_sum_y <- sum_y^2
sq_sum_y
## [1] 88416409

# y factor of the denominator: n * sum(y^2) - (sum(y))^2.
n * sum_y_sq - sq_sum_y
## [1] 142176674

denominator

# Product of the x factor and the y factor (still under the square root).
deno <- (n * sum_x_sq - sq_sum_x) * (n * sum_y_sq - sq_sum_y)
deno
## [1] 1.706481e+25

sq root of denominator

# Square root of the denominator product. sqrt() states the intent directly
# instead of raising to the power 0.5.
sq_root_deno <- sqrt(deno)
sq_root_deno
## [1] 4.130958e+12

numerator / sq root of denominator (r)

# Correlation coefficient: numerator over the square-rooted denominator.
r <- num / sq_root_deno
r
## [1] 0.9635956

#OR

# Same coefficient from the built-in function (Pearson is the default method).
cor(x, y, method = "pearson")
## [1] 0.9635956

#q9: fit regression line of no of murders (y) on population (x)

##regression:
Regression is a statistical method used to model the relationship between a dependent variable and one or more independent variables.

formula:
\[ \frac{\sum y}{n} = a + b \frac{\sum x}{n} \]
\[ b = \frac{n \sum xy - (\sum x)(\sum y)}{n \sum x^2 - (\sum x)^2} \]
\[ a = \frac{\sum y}{n} - b \frac{\sum x}{n} \]

# x: population (independent variable); y: murder count (dependent variable).
x <- murders[["population"]]
y <- murders[["total"]]

#sum of x (population)

# Total of x.
sum_x <- sum(x)
sum_x
## [1] 309864228

length of x (population)

# Number of observations in x.
n <- length(x)
n
## [1] 51

sum of y (no of murders)

# Total of y.
sum_y <- sum(y)
sum_y
## [1] 9403

number of observations (x and y have the same length)

# x and y are paired observations, so both vectors must have the same
# length; stop early if they do not, then take n from y.
stopifnot(length(y) == length(x))
n <- length(y)
n
## [1] 51

xy

# Elementwise product of x and y.
xy <- x * y
xy
##  [1]   645264360    13494389  1482947944   271180374 46828222692   326897740
##  [7]   346687409    34121492    59570577 13171039857  3729920000     9522107
## [13]    18810984  4670350048   920699884    63973455   179746434   503366572
## [19]  1591213572    14611971  1691650736   772620222  4081943320   281108025
## [25]   356075640  1922445567    11872980    58442912   226846284     6582350
## [31]  2162805924   137964993 10018478734  2727148138     2690364  3576316240
## [37]   416399961   137918664  5804987203    16841072   957450348     6513440
## [43]  1389796995 20242176605    60805470     1251482  2000256000   625382220
## [49]    50030838   551637642     2818130

#sum of xy

# Total of the elementwise products.
sum_xy <- sum(xy)
sum_xy
## [1] 135180900360

#n*sum of xy

# First term of the slope numerator.
sum(xy) * n
## [1] 6.894226e+12

sum of x * sum of y

# Second term of the slope numerator.
sum(x) * sum(y)
## [1] 2.913653e+12

numerator

# Slope numerator: n * sum(xy) - sum(x) * sum(y).
num <- sum(xy) * n - sum(x) * sum(y)
num
## [1] 3.980573e+12

#sq of x

# Square every element of x.
sq_x <- x^2
sq_x
##  [1] 2.284588e+13 5.044281e+11 4.085788e+13 8.502578e+12 1.387857e+15
##  [6] 2.529281e+13 1.277417e+13 8.062855e+11 3.620706e+11 3.876037e+14
## [11] 9.840640e+13 1.850419e+12 2.457313e+12 1.646251e+14 4.203969e+13
## [16] 9.280279e+12 8.140282e+12 1.883011e+13 2.055146e+13 1.764543e+12
## [21] 3.333390e+13 4.287145e+13 9.768634e+13 2.813162e+13 8.804851e+12
## [26] 3.586725e+13 9.789420e+11 3.335521e+12 7.292976e+12 1.733093e+12
## [31] 7.729740e+13 4.240218e+12 3.755108e+14 9.092544e+13 4.523787e+11
## [36] 1.330909e+14 1.407263e+13 1.467713e+13 1.613504e+14 1.107897e+12
## [41] 2.139399e+13 6.628891e+11 4.027305e+13 6.322992e+14 7.639060e+12
## [46] 3.915518e+11 6.401639e+13 4.521944e+13 3.433587e+12 3.234181e+13
## [51] 3.176743e+11

#sum of sq of x

# Total of the squared x values.
sum_sq_x <- sum(sq_x)
sum_sq_x
## [1] 4.236103e+15

#n*sum of x sq

# n times the sum of squared x values.
n * sum_sq_x
## [1] 2.160412e+17

sq of sum of x

# Square of the total of x.
sq_sum_x <- sum_x^2
sq_sum_x
## [1] 9.601584e+16

denominator

# Slope denominator: n * sum(x^2) - (sum(x))^2.
deno <- n * sum_sq_x - sq_sum_x
deno
## [1] 1.200254e+17

#b

# Slope of the regression line.
b <- num / deno
b
## [1] 3.316442e-05

mean of x

# Average of x.
mean_x <- sum_x / n
mean_x
## [1] 6075769

mean of y

# Average of y (stored only, not displayed).
mean_y <- sum_y / n

#a

# Intercept: mean of y minus slope times mean of x.
a <- mean_y - b * mean_x
a
## [1] -17.12682

#OR, using the built-in lm() function # data

# Predictor (population) and response (murder count) for the model fit.
x <- murders[["population"]]
y <- murders[["total"]]

Fit a linear model

# Least-squares fit with y as the response and x as the single predictor;
# the fitted coefficients are read from `model` below.
model <- lm(y ~ x)

Extract the intercept (a) and slope (b) coefficients

# Select the coefficients by name rather than by position: the intercept is
# labelled "(Intercept)" and the slope carries the predictor's name, "x".
a <- coef(model)["(Intercept)"]
b <- coef(model)["x"]

display population of washington

# Population on the row(s) whose state name is exactly "Washington".
washington_population <- with(murders, population[state == "Washington"])
print(washington_population)
## [1] 6724540

display no of murders in Alaska

# "Atlas" is not a state name in murders$state, so that lookup matched no
# row and returned numeric(0). Alaska is presumably the intended state --
# TODO confirm. The variable name is kept so any later reference still works.
no_of_murders_at_Atlas <- murders$total[murders$state == "Alaska"]
no_of_murders_at_Atlas
## [1] 19

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.