This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print a per-column summary (min, quartiles, mean, max) of the built-in `cars` data frame.
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
Statistics
Modelling
Computation
Calculus
Installation of R
Installation of RStudio
Using R as a calculator
Data importation in R
Data description using numerical measures and graphs.
# A numeric (double) vector.
x <- c(0.5, 0.6)
# Ages of four people, followed by their summary statistics.
age <- c(20, 35, 32, 29)
print(summary(age))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 26.75 30.50 29.00 32.75 35.00
# Re-define `age` with a new sample and report its arithmetic mean.
age <- c(35, 24, 18, 24)
print(mean(age))
## [1] 25.25
# One example of each atomic vector type; `x` is overwritten each time.
# Logical vector.
x <- c(TRUE, FALSE)
# Logical vector via the T/F shorthands. T and F are ordinary variables that can be
# reassigned, so TRUE/FALSE are the safer spelling outside of quick demos.
x <- c(T, F)
# Character vector.
x <- c("a", "b", "c")
# Character vector; note that this variable shares its name with the base function class().
class <- c("M", "F", "F", "M")
# Integer sequence from 9 to 29.
x <- seq(9L, 29L)
# Complex vector.
x <- c(1 + 0i, 2 + 4i)
# A 2 x 3 integer matrix; matrix() fills it column by column.
x <- matrix(seq_len(6), nrow = 2)
print(x)
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
# A 3 x 2 matrix built from the same six values.
y <- matrix(seq_len(6), ncol = 2)
print(y)
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
# Matrix product: (2 x 3) %*% (3 x 2) gives a 2 x 2 result.
x %*% y
## [,1] [,2]
## [1,] 22 49
## [2,] 28 64
# A list can hold elements of different types: numeric, character, logical and complex.
x <- list(1, "a", TRUE, 1 + 4i)
print(x)
## [[1]]
## [1] 1
##
## [[2]]
## [1] "a"
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] 1+4i
# A factor built from yes/no answers; `x` is then replaced by a numeric vector
# whose third element is missing (NA).
x <- factor(c("yes", "yes", "no", "yes", "no"))
x <- c(1, 2, NA, 10, 3)
Return a logical vector indicating which elements are NA
is.na(x)
## [1] FALSE FALSE TRUE FALSE FALSE
# Locate the missing values in a vector and drop them.
# NOTE: the missing value must be the unquoted constant NA. A quoted "NA" is an ordinary
# string: it turns the whole vector into character and is not flagged by is.na().
x <- c(1, 2, 4, NA, 5)
# TRUE at every position that holds a missing value.
bad <- is.na(x)
print(bad)
## [1] FALSE FALSE FALSE TRUE FALSE
# Keep only the non-missing elements.
x[!bad]
## [1] 1 2 4 5
What if there are multiple R objects and you want to take the subset with no missing values in any of those objects?
# Two parallel vectors with missing values at positions 3 and 5.
x <- c(1, 2, NA, 4, NA, 5)
y <- c("a", "b", NA, "d", NA, "f")
# TRUE where neither x nor y is missing at that position.
good <- complete.cases(x, y)
print(good)
## [1] TRUE TRUE FALSE TRUE FALSE TRUE
If a vector contains any character element, R converts every element of the vector to a character string.
If a vector contains only logical values and numbers, R converts the logical values to numbers: every TRUE becomes 1 and every FALSE becomes 0.
sum(c(TRUE,TRUE,FALSE,FALSE,FALSE))
## [1] 2
Creating data with a data frame. A data frame is more general than a matrix. How? A data frame can hold columns of different modes (numeric, character, and so on), similar to the data sets you see in SPSS, SAS, and other packages.
### Let’s create a data frame
# Column vectors for five students; math_score has one missing value.
studentID  <- c(1, 2, 3, 4, 5)
math_score <- c(12, 17, 10, 9, NA)
gender     <- c("M", "F", "M", "M", "F")
it_score   <- c(13, 18, 11, 10, 19)
# Combine the vectors column-wise into one data frame, one row per student.
scoredata <- data.frame(
  studentID  = studentID,
  gender     = gender,
  math_score = math_score,
  it_score   = it_score
)
print(scoredata)
## studentID gender math_score it_score
## 1 1 M 12 13
## 2 2 F 17 18
## 3 3 M 10 11
## 4 4 M 9 10
## 5 5 F NA 19
#View(scoredata)
Steps: 1. Create a data frame (or matrix) with the variable names. 2. Invoke the text editor on the data object created in the first step.
# Create an empty data frame with three numeric columns, to be filled in by hand.
data_class2 <- data.frame(
  height = numeric(0),
  weight = numeric(0),
  bmi    = numeric(0)
)
# edit() opens a data editor and needs a live session. Skip it when the document is
# knitted or run with Rscript, where no editor can be shown.
if (interactive()) {
  data_class2 <- edit(data_class2)
}
R has features for importing data from different sources (a text file, a spreadsheet, or a database).
# Read the class height/weight table from a local text file.
# NOTE(review): this is an absolute, machine-specific path; the file must exist at
# exactly this location for the document to run.
class_data_path <- "C:\\Users\\Pacy\\OneDrive\\Desktop\\Big data course\\class_data.txt"
data_class <- read.table(file = class_data_path)
# Column names of the imported table.
variable.names(data_class)
## [1] "HEIGHT" "WEIGHT"
head(data_class)
## HEIGHT WEIGHT
## 1 161 50
## 2 155 49
## 3 158 42
## 4 170 65
## 5 160 60
## 6 156 52
tail(data_class)
## HEIGHT WEIGHT
## 38 164 47
## 39 163 52
## 40 168 55
## 41 157 48
## 42 164 58
## 43 154 55
data_class[10:20,]
## HEIGHT WEIGHT
## 10 167 51
## 11 160 60
## 12 155 42
## 13 154 53
## 14 155 48
## 15 157 48
## 16 157 48
## 17 160 53
## 18 158 52
## 19 160 51
## 20 160 53
summary(data_class$WEIGHT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 42.00 48.00 52.00 52.47 56.00 65.00
length(data_class$WEIGHT)
## [1] 43
data_class[,-1]
## [1] 50 49 42 65 60 52 58 46 45 51 60 42 53 48 48 48 53 52 51 53 44 56 63 52 57
## [26] 49 52 54 46 50 61 55 45 63 60 56 52 47 52 55 48 58 55
To use a data set that is built into R:
data() # list of datasets currently available
data("airquality")
variable.names(airquality)
## [1] "Ozone" "Solar.R" "Wind" "Temp" "Month" "Day"
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
data_class
## HEIGHT WEIGHT
## 1 161 50
## 2 155 49
## 3 158 42
## 4 170 65
## 5 160 60
## 6 156 52
## 7 162 58
## 8 158 46
## 9 158 45
## 10 167 51
## 11 160 60
## 12 155 42
## 13 154 53
## 14 155 48
## 15 157 48
## 16 157 48
## 17 160 53
## 18 158 52
## 19 160 51
## 20 160 53
## 21 152 44
## 22 154 56
## 23 150 63
## 24 161 52
## 25 162 57
## 26 164 49
## 27 161 52
## 28 155 54
## 29 159 46
## 30 163 50
## 31 159 61
## 32 160 55
## 33 158 45
## 34 165 63
## 35 156 60
## 36 163 56
## 37 155 52
## 38 164 47
## 39 163 52
## 40 168 55
## 41 157 48
## 42 164 58
## 43 154 55
summary(data_class)
## HEIGHT WEIGHT
## Min. :150.0 Min. :42.00
## 1st Qu.:156.0 1st Qu.:48.00
## Median :159.0 Median :52.00
## Mean :159.3 Mean :52.47
## 3rd Qu.:162.0 3rd Qu.:56.00
## Max. :170.0 Max. :65.00
str(data_class)
## 'data.frame': 43 obs. of 2 variables:
## $ HEIGHT: int 161 155 158 170 160 156 162 158 158 167 ...
## $ WEIGHT: int 50 49 42 65 60 52 58 46 45 51 ...
summary(data_class)
## HEIGHT WEIGHT
## Min. :150.0 Min. :42.00
## 1st Qu.:156.0 1st Qu.:48.00
## Median :159.0 Median :52.00
## Mean :159.3 Mean :52.47
## 3rd Qu.:162.0 3rd Qu.:56.00
## Max. :170.0 Max. :65.00
# Mean of each row across the columns. rowMeans() is the direct, vectorised
# equivalent of apply(data_class, 1, mean) and returns the same named vector.
rowMeans(data_class)
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 105.5 102.0 100.0 117.5 110.0 104.0 110.0 102.0 101.5 109.0 110.0 98.5 103.5
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## 101.5 102.5 102.5 106.5 105.0 105.5 106.5 98.0 105.0 106.5 106.5 109.5 106.5
## 27 28 29 30 31 32 33 34 35 36 37 38 39
## 106.5 104.5 102.5 106.5 110.0 107.5 101.5 114.0 108.0 109.5 103.5 105.5 107.5
## 40 41 42 43
## 111.5 102.5 111.0 104.5
# Standard deviation of each column, computed column by column so the data frame is
# not coerced to a matrix; vapply() guarantees one numeric value per column.
vapply(data_class, sd, numeric(1))
## HEIGHT WEIGHT
## 4.304444 5.750150
c(mean(data_class$HEIGHT),sd(data_class$HEIGHT))
## [1] 159.255814 4.304444
c(mean(data_class$WEIGHT),sd(data_class$WEIGHT))
## [1] 52.46512 5.75015
c(Mean=mean(data_class$HEIGHT),SD=sd(data_class$HEIGHT))
## Mean SD
## 159.255814 4.304444
c(Mean=mean(data_class$WEIGHT),SD=sd(data_class$WEIGHT))
## Mean SD
## 52.46512 5.75015
data("iris")
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
mean(iris$Petal.Length)
## [1] 3.758
mean(iris$Petal.Length[iris$Species=="setosa"])
## [1] 1.462
mean(iris$Petal.Length[iris$Species=="versicolor"])
## [1] 4.26
mean(iris$Petal.Length[iris$Species=="virginica"])
## [1] 5.552
## shortcut
by(iris$Petal.Length,iris$Species,mean)
## iris$Species: setosa
## [1] 1.462
## ------------------------------------------------------------
## iris$Species: versicolor
## [1] 4.26
## ------------------------------------------------------------
## iris$Species: virginica
## [1] 5.552
by(iris$Petal.Length,iris$Species,sd)
## iris$Species: setosa
## [1] 0.173664
## ------------------------------------------------------------
## iris$Species: versicolor
## [1] 0.469911
## ------------------------------------------------------------
## iris$Species: virginica
## [1] 0.5518947
by(iris$Petal.Length,iris$Species,summary)
## iris$Species: setosa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.400 1.500 1.462 1.575 1.900
## ------------------------------------------------------------
## iris$Species: versicolor
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 4.00 4.35 4.26 4.60 5.10
## ------------------------------------------------------------
## iris$Species: virginica
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.500 5.100 5.550 5.552 5.875 6.900
# Compare sepal length across the three species. The formula uses bare column names so
# that `data = iris` resolves them, and the axes are labelled with the column names
# rather than with "iris$..." expressions.
boxplot(Sepal.Length ~ Species, data = iris, main = "Comparison")
# Flatten the built-in UCBAdmissions contingency table into one row per
# Admit x Gender x Dept combination, with the count in `Freq`.
ucba <- data.frame(UCBAdmissions)
head(ucba)
## Admit Gender Dept Freq
## 1 Admitted Male A 512
## 2 Rejected Male A 313
## 3 Admitted Female A 89
## 4 Rejected Female A 19
## 5 Admitted Male B 353
## 6 Rejected Male B 207
# Gender x Admit counts pooled over all departments (built once, then printed).
cross <- xtabs(Freq ~ Gender + Admit, data = ucba)
print(cross)
## Admit
## Gender Admitted Rejected
## Male 1198 1493
## Female 557 1278
## Is there gender bias in UCB graduate admission process?
# Row proportions (margin = 1) give the admission rate within each gender, which is the
# quantity the question asks about and matches the per-department table computed later.
admit_rate <- prop.table(cross, margin = 1)
print(admit_rate)
## Admit
## Gender Admitted Rejected
## Male 0.4451877 0.5548123
## Female 0.3035422 0.6964578
Simpson's paradox: a phenomenon in which a trend that appears in combined groups of data disappears or reverses when the data are broken down into groups.
# Same Gender x Admit table restricted to department A, shown as proportions within each gender.
dept_a <- ucba[ucba$Dept == "A", ]
cross2 <- xtabs(Freq ~ Gender + Admit, data = dept_a)
prop.table(cross2, margin = 1)
## Admit
## Gender Admitted Rejected
## Male 0.6206061 0.3793939
## Female 0.8240741 0.1759259
# Stacked bar chart: two rows of figures, one bar per product column.
product_text <- "ProdA ProdB ProdC ProdD
1 110 50 60 70
2 120 50 80 65"
dat <- read.table(text = product_text, header = TRUE)
barplot(as.matrix(dat), beside = FALSE, col = c("Red", "green"))
# Side-by-side variant, kept for reference:
#barplot(as.matrix(dat),beside=TRUE,col=c("gold3","red"))
# Second example: four rows over seven columns, stacked with the default colours.
group_text <- "A B C D E F G
1 10 80 30 90 70 60 90
2 20 50 70 50 40 10 40
3 60 80 80 60 60 30 160
4 20 40 70 80 20 10 70"
dat <- read.table(text = group_text, header = TRUE)
barplot(as.matrix(dat))
plot(WEIGHT~HEIGHT,data=data_class)
plot(data_class$HEIGHT,data_class$WEIGHT)
plot(data_class[,1],data_class[,2],main = "Graph")
cor(data_class$HEIGHT,data_class$WEIGHT)
## [1] 0.2663495
# Scatter plot of weight against height with the least-squares line overlaid.
plot(WEIGHT ~ HEIGHT, data = data_class)
# abline() accepts the fitted model directly. This avoids `$coefficient`, which only
# worked through partial matching of the model's `coefficients` component.
abline(lm(WEIGHT ~ HEIGHT, data = data_class))
# NOTE(review): this model regresses HEIGHT on WEIGHT, the reverse of the line drawn
# above (WEIGHT on HEIGHT) -- confirm which direction is intended.
data_lm <- lm(HEIGHT ~ WEIGHT, data = data_class)
summary(data_lm)
##
## Call:
## lm(formula = HEIGHT ~ WEIGHT, data = data_class)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.3563 -2.6662 0.2326 2.0378 8.2449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 148.7951 5.9466 25.022 <2e-16 ***
## WEIGHT 0.1994 0.1127 1.769 0.0843 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.199 on 41 degrees of freedom
## Multiple R-squared: 0.07094, Adjusted R-squared: 0.04828
## F-statistic: 3.131 on 1 and 41 DF, p-value: 0.08427
# Distribution of each measurement.
hist(data_class$HEIGHT)
hist(data_class$WEIGHT, col = "blue")
# Weight against height with the fitted straight line and a lowess smoother (blue).
plot(WEIGHT ~ HEIGHT, data = data_class)
# Pass the model itself to abline(); `$coefficient` only resolved via partial matching.
abline(lm(WEIGHT ~ HEIGHT, data = data_class))
lines(lowess(data_class$HEIGHT, data_class$WEIGHT), col = "blue")
# Correlation between the two columns (cor() defaults to Pearson).
cor(data_class$HEIGHT, data_class$WEIGHT)
## [1] 0.2663495
lm(WEIGHT~HEIGHT,data=data_class)
##
## Call:
## lm(formula = WEIGHT ~ HEIGHT, data = data_class)
##
## Coefficients:
## (Intercept) HEIGHT
## -4.1992 0.3558
# Body-mass index: WEIGHT / (HEIGHT / 100)^2.
# NOTE(review): assumes HEIGHT is in centimetres and WEIGHT in kilograms -- confirm.
# with() evaluates the expression inside data_class, so the columns are found without
# attach(), which would leave the data frame on the search path for the rest of the session.
(BMI <- with(data_class, WEIGHT / (HEIGHT / 100)^2))
## [1] 19.28938 20.39542 16.82423 22.49135 23.43750 21.36752 22.10029 18.42653
## [9] 18.02596 18.28678 23.43750 17.48179 22.34778 19.97919 19.47341 19.47341
## [17] 20.70312 20.83000 19.92187 20.70312 19.04432 23.61275 28.00000 20.06095
## [25] 21.71925 18.21832 20.06095 22.47659 18.19548 18.81892 24.12879 21.48437
## [33] 18.02596 23.14050 24.65483 21.07720 21.64412 17.47472 19.57168 19.48696
## [41] 19.47341 21.56454 23.19109
# Same values rounded to two decimals; the `digits` argument is spelled out in full
# instead of relying on partial argument matching.
(BMI <- with(data_class, round(WEIGHT / (HEIGHT / 100)^2, digits = 2)))
## [1] 19.29 20.40 16.82 22.49 23.44 21.37 22.10 18.43 18.03 18.29 23.44 17.48
## [13] 22.35 19.98 19.47 19.47 20.70 20.83 19.92 20.70 19.04 23.61 28.00 20.06
## [25] 21.72 18.22 20.06 22.48 18.20 18.82 24.13 21.48 18.03 23.14 24.65 21.08
## [37] 21.64 17.47 19.57 19.49 19.47 21.56 23.19
head(cbind(data_class,BMI),n=5)
## HEIGHT WEIGHT BMI
## 1 161 50 19.29
## 2 155 49 20.40
## 3 158 42 16.82
## 4 170 65 22.49
## 5 160 60 23.44
# Keep the table with the BMI column appended, then show its last ten rows.
new_data_class <- cbind(data_class, BMI)
tail(new_data_class, n = 10)
## HEIGHT WEIGHT BMI
## 34 165 63 23.14
## 35 156 60 24.65
## 36 163 56 21.08
## 37 155 52 21.64
## 38 164 47 17.47
## 39 163 52 19.57
## 40 168 55 19.49
## 41 157 48 19.47
## 42 164 58 21.56
## 43 154 55 23.19
summary(BMI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.82 19.16 20.40 20.70 22.23 28.00
stem(BMI,scale=2)
##
## The decimal point is at the |
##
## 16 | 8
## 17 | 55
## 18 | 0022348
## 19 | 03555569
## 20 | 0114778
## 21 | 145667
## 22 | 1455
## 23 | 12446
## 24 | 17
## 25 |
## 26 |
## 27 |
## 28 | 0
today <- Sys.Date()
format(today, format="%B %d %Y")
## [1] "May 24 2026"
format(today, format="%A")
## [1] "Sunday"
#today <- Sys.Date()
# Number of days between a date of birth and a fixed reference date.
dob <- as.Date("2000-02-10")
today <- as.Date("2024-8-25")
print(difftime(today, dob, units = "days"))
## Time difference of 8963 days
# Difference between two Date objects, expressed in days.
startdate <- as.Date("2004-02-13")
enddate <- as.Date("2011-01-22")
days <- difftime(enddate, startdate, units = "days")
print(days)
## Time difference of 2535 days
#Time difference of 2535 days
# ---- Avoidable deaths by type of health facility ----
# One count vector per facility type; the three entries are, in order,
# "Not Avoidable", "Potentially avoidable" and "Undetermined".
Health_post <- c(0, 2, 0)
Health_center <- c(1, 15, 2)
Private_facility <- c(0, 0, 2)
District_hospital <- c(9, 116, 17)
Referral_hospital <- c(64, 95, 9)
Provincial_hospital <- c(2, 15, 2)

# The same table built two ways: `df` spells the counts out inline, `df1` reuses the vectors.
df <- data.frame(
  Health_post = c(0, 2, 0),
  Health_center = c(1, 15, 2),
  Private_facility = c(0, 0, 2),
  District_hospital = c(9, 116, 17),
  Referral_hospital = c(64, 95, 9),
  Provincial_hospital = c(2, 15, 2)
)
df1 <- data.frame(
  Health_post, Health_center, Private_facility,
  District_hospital, Referral_hospital, Provincial_hospital
)
rownames(df1) <- c("Not Avoidable", "Potentially avoidable", "Undetermined")

# Nothing below calls a gplots function (mosaicplot() comes from graphics), so the
# package is attached only when it is installed instead of stopping the script.
if (requireNamespace("gplots", quietly = TRUE)) {
  library("gplots")
}
library("graphics")

# Transpose so that facility types are the rows and outcome categories the columns.
dt <- as.table(t(as.matrix(df1)))
#dt<-as.table(as.matrix(df))
mosaicplot(dt, shade = TRUE, las = 2, main = "Avoidable deaths")
print(dt)
# Chi-squared test on the facility x outcome table. Several cells are 0, so R may
# warn that the chi-squared approximation is inaccurate.
chisq <- chisq.test(dt)

##############################
# ---- High-risk classification by type of health facility ----
# Entries are, in order, "No High Risk", "High Risk" and "Undetermined".
Health_post <- c(0, 2, 0)
Health_center <- c(3, 9, 2)
Private_facility <- c(0, 1, 0)
District_hospital <- c(43, 81, 24)
Referral_hospital <- c(33, 86, 48)
Provincial_hospital <- c(2, 15, 2)
#Rownames<-c("Health post","Heath Center","Private facility","District Hospital","Referral Hospital","Provincial Hospital")
Df2 <- data.frame(
  Health_post, Health_center, Private_facility,
  District_hospital, Referral_hospital, Provincial_hospital
)
rownames(Df2) <- c("No High Risk", "High Risk", "Undetermined")

# Facility types in rows, risk categories in columns; this replaces the earlier `dt`.
dt <- as.table(t(as.matrix(Df2)))
#dt<-as.table(as.matrix(df))
#mosaicplot(dt,shade=TRUE,las=2,main="Avoidable deaths")
print(dt)
# Chi-squared test on the second table; this replaces the earlier `chisq`.
chisq <- chisq.test(dt)