# Load the SSC/INTER marks file into the data frame "m1" and preview its
# first rows.
m1 <- read.csv(file = "C:/Users/pradeep/OneDrive/datasets/ssc_inter.csv")
head(m1)
## SNO REGNO NAME BRANCH SSC INTER B.TECH.
## 1 1 12PA1A0501 ABDUL IMROZ CSE 85.67 66.3 65.07
## 2 2 12PA1A0502 ADIREDDY DIVYASRI CSE 82.83 83.4 64.50
## 3 3 12PA1A0504 AMUDALA MANIKIRAN KUMAR CSE 81.83 91.5 66.70
## 4 4 12PA1A0510 BALUSU KIRAN KUMAR CSE 78.50 82.7 65.70
## 5 5 12PA1A0518 CHAGANTI AMRUTHA CSE 84.50 93.6 80.00
## 6 6 12PA1A0519 CHALAVADI RAJESH CSE 89.67 94.4 76.90
# Fit a simple linear regression on "m1": "SSC" is the predictor and "INTER"
# is the response. Printing the fit shows the call and the coefficients.
relation <- lm(formula = INTER ~ SSC, data = m1)
print(relation)
##
## Call:
## lm(formula = INTER ~ SSC, data = m1)
##
## Coefficients:
## (Intercept) SSC
## 53.5628 0.3895
# Load the file that contains erroneous values, "noisedata.csv", into the
# data frame "m2" and preview its first rows.
m2 <- read.csv(file = "C:/Users/pradeep/OneDrive/datasets/noisedata.csv")
head(m2)
## S.No Roll.No Student.Name Date.of.Birth SSC.. INTER..
## 1 28 10PA1A0592 PATURI RAVALI 27/05/1993 88.50 900.20
## 2 30 10PA1A0594 PERIKALA SAI KIRAN 8/7/1993 82.33 8.10
## 3 55 10PA1A0558 KONAKALLA SWATHI 8/7/1993 80.10 NA
## 4 40 10PA1A0541 GANASALA SURESH 2/5/1992 84.16 77.90
## 5 60 11PA5A0503 CH KUMAR VIJAYA MOULI 6/5/1993 77.80 81.13
## B.Tech.Upto.3.2 Backlogs..if.cleared.don.t.specify. Elephos TIME
## 1 79.733 0 TIME
## 2 67.850 0 ELEPHOS TIME
## 3 76.400 0 TIME
## 4 69.430 0 ELEPHOS TIME
## 5 66.600 0 ELEPHOS TIME
## company.selected
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
# Build a one-column data frame "a" from the "SSC.." column of "m2".
# Note: the column is named "SSC" so that it matches the name of the
# predictor variable used when the model was fitted on "m1".
a <- data.frame(SSC = m2[["SSC.."]])
print(a)
## SSC
## 1 88.50
## 2 82.33
## 3 80.10
## 4 84.16
## 5 77.80
# Predict the "INTER" percentage for every row of "a" using the fitted model
# "relation", then show the predictions.
result <- predict(relation, newdata = a)
print(result)
## 1 2 3 4 5
## 88.02913 85.62622 84.75774 86.33891 83.86201
# Store the predicted "INTER" percentages in the "INTER.." column of "m2".
# Every existing value in that column is replaced by its prediction.
m2[["INTER.."]] <- result
head(m2)
## S.No Roll.No Student.Name Date.of.Birth SSC.. INTER..
## 1 28 10PA1A0592 PATURI RAVALI 27/05/1993 88.50 88.02913
## 2 30 10PA1A0594 PERIKALA SAI KIRAN 8/7/1993 82.33 85.62622
## 3 55 10PA1A0558 KONAKALLA SWATHI 8/7/1993 80.10 84.75774
## 4 40 10PA1A0541 GANASALA SURESH 2/5/1992 84.16 86.33891
## 5 60 11PA5A0503 CH KUMAR VIJAYA MOULI 6/5/1993 77.80 83.86201
## B.Tech.Upto.3.2 Backlogs..if.cleared.don.t.specify. Elephos TIME
## 1 79.733 0 TIME
## 2 67.850 0 ELEPHOS TIME
## 3 76.400 0 TIME
## 4 69.430 0 ELEPHOS TIME
## 5 66.600 0 ELEPHOS TIME
## company.selected
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
Sampling with replacement: once an item is selected from the data set, it is put back into the original population. This means the same item may be sampled more than once.
# Load the sample table into the data frame "m1".
m1 <- read.csv(file = "C:/Users/pradeep/OneDrive/dm,cns,cp and JKC/data mining/dm files 2020 passouts/lab/table_2.csv")
# Display the whole data frame.
m1
## Sl.No Roll.No Name SSC.Perc inter.Diploma.perc
## 1 1 11PA1A0507 X 67 78
## 2 2 11PA1A0508 Y 56 77
## 3 3 11PA1A0509 Z 43 45
## 4 4 11PA1A0510 R 88 65
## 5 5 11PA1A0511 e 34 98
# Sampling with replacement: draw 4 row indices from 1..nrow(m1) with
# "replace = TRUE" and use them to select rows of "m1". Because indices can
# repeat, the same row may appear more than once in "m2".
m2 <- m1[sample(nrow(m1), size = 4, replace = TRUE), ]
print(m2)
## Sl.No Roll.No Name SSC.Perc inter.Diploma.perc
## 5 5 11PA1A0511 e 34 98
## 1 1 11PA1A0507 X 67 78
## 5.1 5 11PA1A0511 e 34 98
## 1.1 1 11PA1A0507 X 67 78
You may notice that some rows are repeated in the above table.
Sampling without replacement: once an item is selected from the data set, it is removed from the original population (data set). This means an item is sampled at most once: it either appears once in the sample or not at all.
# Load the sample table into the data frame "m1".
m1 <- read.csv(file = "C:/Users/pradeep/OneDrive/dm,cns,cp and JKC/data mining/dm files 2020 passouts/lab/table_2.csv")
# Display the whole data frame.
m1
## Sl.No Roll.No Name SSC.Perc inter.Diploma.perc
## 1 1 11PA1A0507 X 67 78
## 2 2 11PA1A0508 Y 56 77
## 3 3 11PA1A0509 Z 43 45
## 4 4 11PA1A0510 R 88 65
## 5 5 11PA1A0511 e 34 98
# Sampling without replacement: draw 4 row indices from 1..nrow(m1) with
# "replace = FALSE" and use them to select rows of "m1". The indices are
# distinct, so no row of "m1" appears more than once in "m2".
m2 <- m1[sample(nrow(m1), size = 4, replace = FALSE), ]
print(m2)
## Sl.No Roll.No Name SSC.Perc inter.Diploma.perc
## 1 1 11PA1A0507 X 67 78
## 2 2 11PA1A0508 Y 56 77
## 3 3 11PA1A0509 Z 43 45
## 4 4 11PA1A0510 R 88 65
# Load the box-plot data into the data frame "bp" and display it.
bp <- read.csv(file = "C:/Users/pradeep/OneDrive/dm,cns,cp and JKC/data mining/dm files 2020 passouts/lab/data_for_boxplot.csv")
bp
## CSE.A CSE.B
## 1 87.3 65.3
## 2 89.0 92.4
## 3 67.0 68.0
## 4 71.0 70.4
## 5 67.0 65.5
## 6 77.0 98.0
## 7 88.0 87.0
## 8 99.0 76.0
## 9 56.0 65.0
## 10 57.0 54.0
## 11 45.0 43.0
# Draw side-by-side box plots of the "CSE.A" and "CSE.B" columns of "bp",
# labelled "CSE A" and "CSE B".
boxplot(bp[["CSE.A"]], bp[["CSE.B"]], names = c("CSE A", "CSE B"))
From the above boxplot, we can say that CSE A performs better than CSE B. In CSE A, 50 percent of the students scored above 70, and the top 25 percent scored above 88. In CSE B, by contrast, the top 25 percent have marks starting from about 80.