Importing the file from system into R
# Read a comma-separated file picked interactively through a file dialog.
# header = TRUE: the first row of the file holds the column names.
# stringsAsFactors = TRUE: keep text columns as factors so the factor-style
# summary(data) output shown further down is reproduced on R >= 4.0.0, where
# the default became FALSE.
data <- read.csv(file.choose(), header = TRUE, stringsAsFactors = TRUE)
data
## Names Age ID Gender Height Region Profit Talkitive
## 1 sujith 23 4567 male 5.11 A.P 41.9136 yes
## 2 kumar 24 3456 male 5.30 Andhra 219.5820 no
## 3 atanu 25 2153 male 5.50 Kolkata 6.8714 no
## 4 karum 27 2134 male 4.50 Pondichary -383.0310 yes
## 5 ravi 26 3214 male 5.40 Hyderabad 2.5164 yes
## 6 pavan 25 2614 male 3.40 Vijayavada 14.1694 no
## 7 shyama 22 2874 female 5.00 Kerala 1.9656 yes
## 8 pream 24 1234 male 2.10 Tamil nadu 90.7152 yes
# (or)
.csv file:-
# Same import through the general-purpose reader: read.table() needs the comma
# separator spelled out. TRUE is written in full because T is an ordinary
# variable that can be reassigned.
data1 <- read.table(file.choose(), header = TRUE, sep = ",")
data1
Text file:-
# Import a tab-delimited text file chosen interactively.
data2 <- read.delim(file.choose(), header = TRUE, sep = "\t")
# (or)
# Tab is already the default separator of read.delim(), so `sep` may be left out.
data2 <- read.delim(file.choose(), header = TRUE)
data2
Descriptive Statistics
Descriptive statistics is the term given to the analysis of data that helps describe, show or summarize data in a meaningful way.
Mean
To access the columns of the data frame directly by name, we need to attach it.
# Put the columns of `data` on the search path so they can be used by bare name.
attach(what = data)
# Arithmetic mean of the Age column.
mean(x = Age)
## [1] 24.5
# Trimmed mean: `trim` is the fraction of observations dropped from each end
# before averaging.
mean(x = Age, trim = 0.1)
## [1] 24.5
Mode (storage mode of the object, not the statistical mode)
# mode() reports how the object is stored (its type), not the most frequent value.
mode(x = Age)
## [1] "numeric"
table
# Frequency table: how often each distinct Age occurs. The value(s) with the
# highest count are the statistical mode.
table(Age, dnn = "Age")
## Age
## 22 23 24 25 26 27
## 1 1 2 2 1 1
variance
# Sample variance of Age.
var(x = Age)
## [1] 2.571429
standard deviation
# Sample standard deviation of Age.
sd(x = Age)
## [1] 1.603567
# (or)
# Equivalent: the square root of the sample variance.
sqrt(var(x = Age))
## [1] 1.603567
Maximum
# Largest Age; na.rm = FALSE is the default and is only spelled out here.
max(Age, na.rm = FALSE)
## [1] 27
Minimum
# Smallest Age; na.rm = FALSE is the default and is only spelled out here.
min(Age, na.rm = FALSE)
## [1] 22
Range
# Minimum and maximum of Age returned together as a length-two vector.
range(Age, na.rm = FALSE)
## [1] 22 27
Quantile
# 90th percentile of Age.
quantile(x = Age, probs = 0.9)
## 90%
## 26.3
# Several quantiles in one call: minimum, 20th, median, 90th and maximum.
quantile(x = Age, probs = c(0, 0.2, 0.5, 0.9, 1))
## 0% 20% 50% 90% 100%
## 22.0 23.4 24.5 26.3 27.0
Correlation
# Correlation coefficient between Age and Height (Pearson by default).
cor(x = Age, y = Height)
## [1] -0.004837542
Covariance
# Sample covariance between Age and Height.
cov(x = Age, y = Height)
## [1] -0.009285714
# var() called with two vectors returns the same covariance.
var(x = Age, y = Height)
## [1] -0.009285714
summary
# For a numeric vector summary() reports min, 1st quartile, median, mean,
# 3rd quartile and max.
summary(object = Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.00 23.75 24.50 24.50 25.25 27.00
# Applied to the whole data frame it summarises every column: the same six
# numbers for numeric columns and level counts for factor columns.
summary(object = data)
## Names Age ID Gender Height
## atanu :1 Min. :22.00 Min. :1234 female:1 Min. :2.100
## karum :1 1st Qu.:23.75 1st Qu.:2148 male :7 1st Qu.:4.225
## kumar :1 Median :24.50 Median :2744 Median :5.055
## pavan :1 Mean :24.50 Mean :2781 Mean :4.539
## pream :1 3rd Qu.:25.25 3rd Qu.:3274 3rd Qu.:5.325
## ravi :1 Max. :27.00 Max. :4567 Max. :5.500
## (Other):2
## Region Profit Talkitive
## A.P :1 Min. :-383.0310 no :3
## Andhra :1 1st Qu.: 2.3787 yes:5
## Hyderabad :1 Median : 10.5204
## Kerala :1 Mean : -0.6622
## Kolkata :1 3rd Qu.: 54.1140
## Pondichary:1 Max. : 219.5820
## (Other) :2
Normal distribution
# P(X <= 70) for X ~ Normal(mean = 24, sd = 1.6). `q` is a value of X, not a
# percentage; lower.tail = TRUE returns the area to the left of q.
pnorm(q = 70, mean = 24, sd = 1.6, lower.tail = TRUE)
## [1] 1
# lower.tail = FALSE returns the upper tail instead, P(X > 70).
pnorm(q = 70, mean = 24, sd = 1.6, lower.tail = FALSE)
## [1] 4.528788e-182
# Upper-tail probability for a standard-normal z-score: P(Z > 1).
pnorm(q = 1, mean = 0, sd = 1, lower.tail = FALSE)
## [1] 0.1586553
# qnorm() is the inverse of pnorm(): it returns the quantile (percentile) that
# belongs to a given probability.
# First quartile (Q1) of Normal(mean = 24, sd = 1.6).
qnorm(p = 0.25, mean = 24, sd = 1.6, lower.tail = TRUE)
## [1] 22.92082
charts
Barchart
# One bar per observation with bar height = Age; las = 1 draws the axis labels
# horizontally.
barplot(height = Age, main = "Age", xlab = "age", ylab = "values", las = 1)

# Same chart with the default axis-label orientation.
barplot(height = Age, main = "Age", xlab = "age", ylab = "values")

Piechart
# Pie chart of Height, one slice per person, drawn clockwise.
# labels: slice captions. names(Height) is NULL for a plain column vector, which
#   made pie() fall back to the indices 1..n, so the slices are labelled with
#   the matching Names instead.
# edges: number of polygon edges approximating the circle (more = smoother).
pie(Height, radius = 1, clockwise = TRUE, labels = as.character(Names),
    edges = 100)

Boxplot
# Box-and-whisker plot of Height with default settings.
boxplot(x = Height)

# Same plot with a title and a y-axis label. ylim fixes the y-axis to 0..6 and
# las = 1 draws the axis numbers horizontally instead of vertically.
boxplot(x = Height, main = "Height", ylab = "numbers", ylim = c(0, 6), las = 1)

# Formula interface: one box of Age per level of Gender.
boxplot(Age ~ Gender)

# The same two groups passed as separate vectors (male first, then female).
boxplot(Age[Gender == "male"], Age[Gender == "female"])

Histogram
# Histogram of Age with counts on the y-axis (the default).
hist(Age)

# freq = FALSE plots densities instead of counts.
hist(Age, freq = FALSE)

# A single number for `breaks` is only a suggestion for the number of cells;
# hist() rounds it to "pretty" breakpoints, so the bar count may differ.
hist(Age, breaks = 7)

Scatterplot
# Scatter plot of Height against Age.
# xlim limits the x-axis to 20..30, pch selects the point symbol, col the
# colour, and las = 1 keeps the axis labels horizontal.
plot(
  x = Age, y = Height,
  main = "Scatterplot", las = 1, xlim = c(20, 30), pch = 2, col = 4
)
Stem and leaf
# Heights of the male rows only. A separate name is used because assigning to
# `Height` would create a global copy that masks the attached column, leaving
# any later use of `Height` with fewer values than `Age`.
male_height <- Height[Gender == "male"]
# scale controls the length of the display (scale = 2 makes it roughly twice
# as long); it does not change the values shown.
stem(male_height, scale = 1)
##
## The decimal point is at the |
##
## 2 | 1
## 3 | 4
## 4 | 5
## 5 | 1345
Stacked bar chart (to find the relationship between categorical variables)
# Cross-tabulate Names by Gender.
# NOTE: the variable name `table` shadows base::table(); it is kept because the
# mosaicplot(table) call further down reads this variable.
table <- table(Names, Gender)
# Default layout: one bar per Gender with the Names counts stacked.
barplot(table)

# beside = TRUE draws the counts as bars side by side instead of stacked.
barplot(table, beside = TRUE)

Mosaic plot
# Mosaic plot of the Names-by-Gender contingency table built above.
mosaicplot(x = table)

END