LabProject1

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the Project 1 data and attach the packages used in this report.
# `data_dir` is specific to the author's machine; edit it when running
# elsewhere.
data_dir <- "/Users/munira/Desktop/Education/Masters/Illinois Institute of Technology/Coursework/545-Stats/Data"
# Read through an explicit path rather than setwd(), so the session's
# working directory is left untouched.
mydata <- read.csv(file.path(data_dir, "Project1Data.csv"), header = TRUE)
# Packages are attached in the order listed.
pacman::p_load(ggplot2, ggthemes, lessR, prettyR, psych, QuantPsyc)

# Score the 15 STATS multiple-choice items (X1a-X15a) into Y1a-Y15a:
# 1 when the response equals the keyed letter, 0 otherwise. The key below is
# listed in item order, 1a through 15a.
mydata[paste0("Y", 1:15, "a")] <- Map(
  function(response, correct) ifelse(response == correct, 1, 0),
  unname(as.list(mydata[paste0("X", 1:15, "a")])),
  c("c", "d", "b", "a", "c", "c", "a", "c", "d", "c", "a", "b", "c", "a", "c")
)

# STATS total: item scores added left to right (Y1a + Y2a + ... + Y15a).
mydata$sumA <- Reduce(`+`, mydata[paste0("Y", 1:15, "a")])

# Reverse-score section B items 3b, 4b, 5b, 9b, 12b and 14b into Y* columns:
# 1 -> 5, 2 -> 4, 4 -> 2, 5 -> 1. Every other non-missing value becomes 3.
mydata[paste0("Y", c(3, 4, 5, 9, 12, 14), "b")] <- lapply(
  unname(as.list(mydata[paste0("X", c(3, 4, 5, 9, 12, 14), "b")])),
  function(raw) {
    ifelse(
      raw == 1, 5,
      ifelse(raw == 2, 4, ifelse(raw == 4, 2, ifelse(raw == 5, 1, 3)))
    )
  }
)

# Copy the section B items that are not reverse-scored (1b, 2b, 6b, 7b, 8b,
# 10b, 11b, 13b, 15b) into numeric Y* columns, all in one call.
mydata[paste0("Y", c(1, 2, 6, 7, 8, 10, 11, 13, 15), "b")] <- lapply(
  unname(as.list(mydata[paste0("X", c(1, 2, 6, 7, 8, 10, 11, 13, 15), "b")])),
  function(x) {
    # as.numeric() on a factor returns the internal level codes, not the
    # printed values, so go through character first in that case.
    if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
  }
)

# All 15 recoded self-efficacy items, in item order.
var.names <- paste0("Y", 1:15, "b")

# SELFEFFICACY score: mean of the recoded items. na.rm = TRUE averages over
# the items a respondent actually answered, so one missing item no longer
# turns the whole score into NA. The console output recorded further down
# (n = 43 for avg.b) was produced before this change.
mydata$avg.b <- rowMeans(mydata[var.names], na.rm = TRUE)

# Descriptive statistics for the STATS total score.
describe(mydata[["sumA"]])

##    vars  n mean  sd median trimmed  mad min max range skew kurtosis   se
## X1    1 44 6.59 1.6      7    6.53 1.48   2  11     9 0.17     1.02 0.24

# Descriptive statistics for the SELFEFFICACY average.
describe(mydata[["avg.b"]])

##    vars  n mean   sd median trimmed mad  min  max range  skew kurtosis
## X1    1 43 3.68 0.34   3.67    3.69 0.3 2.87 4.33  1.47 -0.39    -0.06
##      se
## X1 0.05

# Frequency counts by birth month.
table(mydata[["Month.of.Birth"]])

## 
##  1  2  3  4  5  6  7  8  9 10 11 12 
##  4  2  2  9  7  2  3  3  4  3  3  2

# Frequency counts by home state (the unlabeled first column is a blank entry).
table(mydata[["Home.State"]])

## 
##    CA CO CT GA IL LA MA MD MI MN MO MS NJ NV NY OK TN TX VA 
##  1  2  1  1  3  2 10  3  2  1  2  1  1  2  1  5  1  1  3  1

# Frequency counts by political party (again with one blank entry).
table(mydata[["Pol..Party"]])

## 
##     D  N  R 
##  1 29  3 11

# Five-number summary plus mean for each remaining demographic variable.
summary(mydata[["GPA"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.500   3.000   3.400   3.341   3.700   4.000

summary(mydata[["Height"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   59.00   63.00   65.00   65.45   67.00   73.00

summary(mydata[["Time.on.Web"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   15.00   30.00   42.16   60.00  180.00

summary(mydata[["Time.on.Games"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   8.205  10.000  60.000

summary(mydata[["Time.Reading"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   30.00   60.00   66.82   90.00  250.00

summary(mydata[["Time.on.TV"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   27.50   60.00   68.55   97.50  240.00

# Standardize the STATS total (Z) and the SELFEFFICACY average (Z2).
mydata[["Z"]] <- Make.Z(mydata[["sumA"]])
mydata[["Z2"]] <- Make.Z(mydata[["avg.b"]])

# Z3: the standardized STATS score multiplied by 10.
mydata[["Z3"]] <- mydata[["Z"]] * 10
table(mydata[["Z3"]])

## 
## -28.6209453370996 -16.1524146951948 -9.91814937424244 -3.68388405329005 
##                 1                 1                 8                11 
##  2.55038126766234  8.78464658861473  15.0189119095671  21.2531772305195 
##                14                 4                 3                 1 
##  27.4874425514719 
##                 1

describe(mydata[["Z3"]])

##    vars  n mean sd median trimmed  mad    min   max range skew kurtosis
## X1    1 44    0 10   2.55   -0.39 9.24 -28.62 27.49 56.11 0.17     1.02
##      se
## X1 1.51

# Z4: the square of Z3, i.e. of the already-multiplied score.
# NOTE(review): this squares (z * 10), not the STATS score itself - confirm
# that this is what the assignment's "square each person's STATS score" means.
mydata[["Z4"]] <- mydata[["Z3"]]^2
table(mydata[["Z4"]])

## 
## 6.50444461044297 13.5710017180847 77.1700156868604 98.3696870097856 
##               14               11                4                8 
## 225.567714947337 260.900500485546 451.697542391873 755.559498020468 
##                3                1                1                1 
## 819.158511989243 
##                1

describe(mydata[["Z4"]])

##    vars  n  mean    sd median trimmed   mad min    max  range skew
## X1    1 44 97.73 177.2  13.57   55.19 10.48 6.5 819.16 812.65 2.85
##    kurtosis    se
## X1     7.99 26.71

Including Plots

You can also embed plots, for example:

# No setwd() here: nothing below reads from disk, so the working directory
# does not matter for the plots.

# Density curves for the STATS total and the SELFEFFICACY average.
qplot(mydata$sumA, geom = "density")

qplot(mydata$avg.b, geom = "density")

## Warning: Removed 1 rows containing non-finite values (stat_density).

# Each score plotted against its row position. The row count is taken from
# the data instead of a hard-coded 1:44, so the calls keep working if the
# number of respondents changes.
plot(mydata$sumA, seq_len(nrow(mydata)), ylab = "Row index")

plot(mydata$avg.b, seq_len(nrow(mydata)), ylab = "Row index")

# Histograms of the remaining demographic variables.
hist(mydata$GPA)

hist(mydata$Height)

hist(mydata$Time.on.Web)

hist(mydata$Time.on.Games)

hist(mydata$Time.Reading)

hist(mydata$Time.on.TV)

# Density curves for the standardized scores (Z, Z2) and the transformed
# STATS variables (Z3, Z4).
qplot(mydata$Z, geom = "density")

qplot(mydata$Z2, geom = "density")

## Warning: Removed 1 rows containing non-finite values (stat_density).

qplot(mydata$Z3, geom = "density")

qplot(mydata$Z4, geom = "density")

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

PROJECT 1: DATA MANIPULATION SIMPLE GRAPHING OF DATA STANDARDIZATION

For this project, you should use the data file labeled ‘Project1Data’ available on the Google Classroom site. These data include the following variables: • ID is an ID code for each student • 1a through 15a are responses to multiple choice items related to statistics knowledge • GPA [Range 0.00 – 4.00] • Birth Month • Home State • “Pol. Party” [D or R] • Height [in inches] • Minutes spent per day on the web • Minutes spent per day playing video games • Minutes spent per day reading • Minutes spent per day watching TV • 1b through 15b are responses to items related to self-efficacy for statistics. [Items coded so that higher scores reflect greater self-efficacy]

ORIGINAL ANSWER: mydata <- read.csv(“/users/munira/Desktop/Project1Data.csv”, header = TRUE)

UPDATE: setwd(“/Users/munira/Desktop/Education/Masters/Illinois Institute of Technology/Coursework/545-Stats/Data”) mydata <- read.csv(“Project1Data.csv”, header = TRUE) pacman::p_load(ggplot2,ggthemes,lessR,prettyR,psych,QuantPsyc)

I’m not sure which packages would be required for the functions, hence enable all known packages

Prior to running any analyses, you will need to manipulate the data a bit. • Recode items 1a through 15a so that each response is either 0 (incorrect answer) or 1 (correct response) using the following answer key: • Correct Answer A: Items = 4a, 7a, 11a, 14a • Correct Answer B: Items = 3a, 12a • Correct Answer C: Items = 1a, 5a, 6a, 8a, 10a, 13a, 15a • Correct Answer D: Items = 2a, 9a

ORIGINAL ANSWER: mydata$Y1a <- ifelse ( mydata$X1a == “c”, 1, 0) mydata$Y2a <- ifelse ( mydata$X2a == “d”, 1, 0) mydata$Y3a <- ifelse ( mydata$X3a == “b”, 1, 0) mydata$Y4a <- ifelse ( mydata$X4a == “a”, 1, 0) mydata$Y5a <- ifelse ( mydata$X5a == “c”, 1, 0) mydata$Y6a <- ifelse ( mydata$X6a == “c”, 1, 0) mydata$Y7a <- ifelse ( mydata$X7a == “a”, 1, 0) mydata$Y8a <- ifelse ( mydata$X8a == “c”, 1, 0) mydata$Y9a <- ifelse ( mydata$X9a == “d”, 1, 0) mydata$Y10a <- ifelse ( mydata$X10a == “c”, 1, 0) mydata$Y11a <- ifelse ( mydata$X11a == “a”, 1, 0) mydata$Y12a <- ifelse ( mydata$X12a == “b”, 1, 0) mydata$Y13a <- ifelse ( mydata$X13a == “c”, 1, 0) mydata$Y14a <- ifelse ( mydata$X14a == “a”, 1, 0) mydata$Y15a <- ifelse ( mydata$X15a == “c”, 1, 0)

UPDATE: I tried doing it using the following code:

STEP 1 mydata$A<-mydata[ , c("X4a", "X7a", "X11a", "X14a")] mydata$B<-mydata[ , c(“X3a”, “X12a”)] mydata$C<-mydata[ , c("X1a", "X5a", "X6a", "X8a", "X10a", "X13a", "X15a")] mydata$D<-mydata[ , c(“X2a”, “X9a”)] OR mydata$A<-with(mydata, data.frame(X4a, X7a, X11a, X14a)) mydata$B<-with(mydata, data.frame(X3a, X12a)) mydata$C<-with(mydata, data.frame(X1a, X5a, X6a, X8a, X10a, X13a, X15a)) mydata$D<-with(mydata, data.frame(X2a, X9a)) OR mydata$A<- subset(mydata, select=c(X4a, X7a, X11a, X14a)) mydata$B<- subset(mydata, select=c(X3a, X12a)) mydata$C<- subset(mydata, select=c(X1a, X5a, X6a, X8a, X10a, X13a, X15a)) mydata$D<- subset(mydata, select=c(X2a, X9a))

STEP 2 mydata[A]<- ifelse ( mydata$A == "a", 1, 0) mydata[B]<- ifelse ( mydata$B == “b”, 1, 0) mydata[C]<- ifelse ( mydata$C == "c", 1, 0) mydata[D]<- ifelse ( mydata$D == “d”, 1, 0)

Step 1 worked fine but Step 2 gave the following error: Error in [<-.data.frame(*tmp*, A, value = c(1, 1, 0, 0, 1, 0, 0, 1, : object ‘A’ not found Error in [<-.data.frame(*tmp*, B, value = c(1, 1, 1, 1, 1, 0, 1, 0, : object ‘B’ not found Error in [<-.data.frame(*tmp*, C, value = c(1, 1, 0, 0, 0, 1, 1, 0, : anyNA() applied to non-(list or vector) of type ‘closure’ Error in [<-.data.frame(*tmp*, D, value = c(1, 1, 0, 0, 1, 1, 1, 1, : anyNA() applied to non-(list or vector) of type ‘closure’

• Create a new variable that is the sum of all the STATS items.

ORIGINAL ANSWER: mydata$sumA<-with(mydata, Y1a+Y2a+Y3a+Y4a+Y5a+Y6a+Y7a+Y8a+Y9a+Y10a+Y11a+Y12a+Y13a+Y14a+Y15a)

UPDATE: None

• Recode the following items from SECTION B such that (1=5)(2=4)(4=2)(5=1): • Items 3b, 4b, 5b, 9b, 12b, 14b

ORIGINAL ANSWER: mydata$Y3b <- ifelse ( mydata$X3b == 1, 5, ifelse ( mydata$X3b == 2, 4, ifelse ( mydata$X3b == 4, 2, ifelse ( mydata$X3b == 5, 1, 3) ) ) ) mydata$Y4b <- ifelse ( mydata$X4b == 1, 5, ifelse ( mydata$X4b == 2, 4, ifelse ( mydata$X4b == 4, 2, ifelse ( mydata$X4b == 5, 1, 3) ) ) ) mydata$Y5b <- ifelse ( mydata$X5b == 1, 5, ifelse ( mydata$X5b == 2, 4, ifelse ( mydata$X5b == 4, 2, ifelse ( mydata$X5b == 5, 1, 3) ) ) ) mydata$Y9b <- ifelse ( mydata$X9b == 1, 5, ifelse ( mydata$X9b == 2, 4, ifelse ( mydata$X9b == 4, 2, ifelse ( mydata$X9b == 5, 1, 3) ) ) ) mydata$Y12b <- ifelse ( mydata$X12b == 1, 5, ifelse ( mydata$X12b == 2, 4, ifelse ( mydata$X12b == 4, 2, ifelse ( mydata$X12b == 5, 1, 3) ) ) ) mydata$Y14b <- ifelse ( mydata$X14b == 1, 5, ifelse ( mydata$X14b == 2, 4, ifelse ( mydata$X14b == 4, 2, ifelse ( mydata$X14b == 5, 1, 3) ) ) )

UPDATE: I have flipped the scores instead of doing 6-x. Am I doing it right?

• Create a new variable for SELFEFFICACY that is the average of items 1b through 15b. Be sure to include the recoded items NOT the original items when computing this variable.

ORIGINAL ANSWER: mydata$Y1b<-as.numeric(mydata$X1b) mydata$Y2b<-as.numeric(mydata$X2b) mydata$Y6b<-as.numeric(mydata$X6b) mydata$Y7b<-as.numeric(mydata$X7b) mydata$Y8b<-as.numeric(mydata$X8b) mydata$Y10b<-as.numeric(mydata$X10b) mydata$Y11b<-as.numeric(mydata$X11b) mydata$Y13b<-as.numeric(mydata$X13b) mydata$Y15b<-as.numeric(mydata$X15b)

var.names<-c(“Y1b”,“Y2b”,“Y3b”,“Y4b”,“Y5b”,“Y6b”,“Y7b”,“Y8b”,“Y9b”,“Y10b”,“Y11b”,“Y12b”,“Y13b”,“Y14b”,“Y15b”) mydata$avg.b<-rowMeans(mydata[var.names])

UPDATE: Is there a way to change the class of multiple columns to numeric in one go? Also, while calculating the average, I should’ve added the argument to ignore NAs: mydata$avg.b<-rowMeans(mydata[var.names], na.rm=TRUE)

Create a grouped frequency distribution (histogram) for the STATS scale scores and for the SELFEFFICACY scale scores.

ORIGINAL ANSWER: hist(mydata$sumA, bin) hist(mydata$avg.b)

UPDATE: qplot(mydata$sumA, geom = "density") qplot(mydata$avg.b, geom = “density”)

Gives a better idea of shape than a histogram

Provide a detailed interpretation of each of the grouped frequency distributions.

ORIGINAL ANSWER: For Stats: Range is 2-11 on x-axis and 0-12 on y-axis. Interval is 1 on x-axis and 2 on y-axis. The plot is like a bell curve. For Efficacy: Range is 2.8-4.4 on x-axis and 0-12 on y-axis. Interval is 0.2 on x-axis and 2 on y-axis. The plot is like a bell curve.

UPDATE: describe(mydata$sumA) describe(mydata$avg.b) Stats: The range of scores is from 2 to 11, with a mean of 6.59 and sd 1.6. The distribution is slightly positively skewed with kurtosis (leptokurtic). Self-efficacy: The range of scores is from 2.87 to 4.33, with a mean of 3.68 and sd 0.34. The distribution is negatively skewed.

Create an ungrouped frequency distribution (histogram) for the STATS scale scores and for the SELFEFFICACY scale scores.

ORIGINAL ANSWER: plot(mydata$sumA, 1:44) plot(mydata$avg.b, 1:44)

UPDATE: None

Are these distributions easier or more difficult to interpret than the grouped distribution? What would you recommend to someone who wanted to understand the distribution of scores? Why?

ORIGINAL ANSWER: The ungrouped distributions are harder to interpret in both cases, while grouped distributions show a bell curve giving us a better idea of the distribution. Hence, the latter is recommended.

UPDATE: Stats: The majority of the scores are clustered between 5 and 7. There’s an outlier at 2. The distribution appears to be positively skewed with long tails (leptokurtic). Self-efficacy: The distribution appears to be negatively skewed with the majority of the scores between 3.5 and 4.0. There are a few outliers below 3.0. The ungrouped distributions give a better idea of the scatter than the grouped frequency distributions in both cases, hence the former is recommended in these cases.

Compute and report the frequencies for the following variables (Birth Month, Home State, and Political Views). Use this information to provide an overview of the sample of individuals.

ORIGINAL ANSWER: table(mydata$Month.of.Birth) table(mydata$Home.State) table(mydata$Pol..Party)

The highest number of people are born in April, followed by May. Least number of births are in February, March, June and December. The highest number of people are from LA, which is an outlier with a difference of 5 from the nearest frequency. There are almost thrice the number of democrats as republicans. 3 people are neutral.

UPDATE: What other characteristics of the sample should I mention?

Create frequency distributions for all of the other demographic variables that you believe provide the best representation of the data (i.e., grouped or ungrouped).

ORIGINAL ANSWER: hist(mydata$GPA) hist(mydata$Height) hist(mydata$Time.on.Web) hist(mydata$Time.on.Games) hist(mydata$Time.Reading) hist(mydata$Time.on.TV)

UPDATE: What type of plots are suitable for what type of distributions?

Having looked at all distributions, provide a high level overview of the characteristics of this sample and their performance on the STATS test and their SELFEFFICACY.

ORIGINAL ANSWER: summary(mydata$sumA) summary(mydata$avg.b) summary(mydata$Month.of.Birth) summary(mydata$Home.State) summary(mydata$Pol..Party) summary(mydata$GPA) summary(mydata$Height) summary(mydata$Time.on.Web) summary(mydata$Time.on.Games) summary(mydata$Time.Reading) summary(mydata$Time.on.TV)

Average time spent on Web is 42.16 with a range of 0-180. Average time spent on Games is 8.205 with a range of 0-60. Average time spent Reading is 66.82 with a range of 0-250. Average time spent on TV is 68.55 with a range of 0-240.

UPDATE: Stats: The range of scores is from 2 to 11, with a mean of 6.59, median of 7 and sd 1.6. The majority of the scores seem to be clustered in the middle as the 1st quartile is 6 and 3rd quartile 7. Median and 3rd quartile being the same shows high frequency at that point. Self-efficacy: The range of scores is from 2.87 to 4.33, with a mean of 3.68, a median of 3.67 and sd 0.34. Hence, mean and median are almost same. 1st quartile is 3.52 and 3rd quartile is 3.87.

Standardize the STATS and SELFEFFICACY scores.

ORIGINAL ANSWER: Z<-Make.Z(mydata$sumA) Z2<-Make.Z(mydata$avg.b)

UPDATE: mydata$Z<-Make.Z(mydata$sumA) mydata$Z2<-Make.Z(mydata$avg.b)

Which student (ID number) has the largest z-score on the STATS test? What is the z-score for this person? Which student (ID number) has the smallest z-score on the STATS test? What is the z-score for this person?

ORIGINAL ANSWER: Student 4 has the largest z-score of -2.8620945 Students 8, 13, 16, 17, 18, 20, 21, 22, 25, 26, 31, 32, 43, 44 have the smallest z-score of 0.2550381

UPDATE: So the largest z-score is always the largest positive value, and the smallest z-score is the most negative value (or, if there are no negative values, the smallest positive value). Thus, the largest z-score for this distribution is 2.75 by Student 1 and the smallest is -2.86 by Student 4.

Which student (ID number) has the largest z-score on the SELFEFFICACY measure? What is the z-score for this person? Which student (ID number) has the smallest z-score on the SELFEFFICACY measure? What is the z-score for this person?

ORIGINAL ANSWER: Student 1 has the largest z-score of -2.36391978 Students 6, 13, 20, 21, 22, 37 have the smallest z-score of -0.03163946

UPDATE: The largest z-score for this distribution is 1.92 by Student 10 and the smallest is -2.40 by Student 1.

Produce frequency distributions for the standardized STATS scores. Compare this distribution to the one you produced in Questions 1 and 2. Are they the same or different? Explain using both your graphical results and words.

ORIGINAL ANSWER: hist(Z) hist(Z2)

The z-score plot for Stats is more clustered in the center, while that for Efficacy is more spread out than the plots for Q1 & Q2. Hence, they are less of a bell curve.

UPDATE: qplot(mydata$Z, geom = "density") qplot(mydata$Z2, geom = “density”)

The distributions remain the same as earlier.

Multiply each person’s STATS score by 10. Produce a grouped frequency distribution on this new variable. Then square each person’s STATS score. Produce a grouped frequency distribution on this new variable.

ORIGINAL ANSWER: Not given, missed the question

UPDATE: mydata$Z3<-mydata$Z*10 table(mydata$Z3) describe(mydata$Z3) qplot(mydata$Z3, geom = "density") mydata$Z4<-mydata$Z3^2 table(mydata$Z4) describe(mydata$Z4) qplot(mydata$Z4, geom = “density”)

What can you say about the distributions for these two new variables in relation to the original STATS score distribution? Be sure to explain your answer using both your graphical results and words.

ORIGINAL ANSWER: Not given, missed the question

UPDATE: Multiplied by 10: Graphically, the distribution appears to be the same as earlier. The mean is now 0, due to standardization, with an sd of 10. The range of scores is from -28.62 to 27.49. The distribution is slightly positively skewed and leptokurtic - same as earlier. Squared: The distribution is much different now, resembling a Pareto curve. The mean is 97.73 with an sd of 177.2 and a range from 6.5 to 819.16. The distribution is significantly positively skewed and leptokurtic, much more than the earlier distribution.

LabProject1

MuniraAjmal

10/4/2019

R Markdown

Including Plots