source("http://www.openintro.org/stat/data/cdc.R")
#1.) Make a scatterplot of weight versus desired weight. Describe the relationship between these two variables.
# Scatterplot with current weight on the x-axis and desired weight on the
# y-axis, one point per row of cdc.
plot(x = cdc$weight, y = cdc$wtdesire)

#2.) Let’s consider a new variable: the difference between desired weight (wtdesire) and current weight (weight). Create this new variable by subtracting the two columns in the data frame and assigning them to a new object called wdiff.
# wdiff = desired weight minus current weight, one value per row of cdc.
#   negative -> desired weight is less than the current weight
#   zero     -> desired weight equals the current weight
#   positive -> desired weight is greater than the current weight
wdiff <- with(cdc, wtdesire - weight)
#3.) What type of data is wdiff? If an observation’s wdiff is 0, what does this mean about the person’s weight and desired weight? What if wdiff is positive or negative?
# Box plot of wdiff to show its spread and outliers.
boxplot(x = wdiff)

# Numerical summary of wdiff (min, quartiles, median, mean, max).
summary(object = wdiff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -21.00 -10.00 -14.59 0.00 500.00
#4.) Describe the distribution of wdiff in terms of its center, shape, and spread, including any plots you use. What does this tell us about how people feel about their current weight?
# Histogram of wdiff, asking hist() for roughly 50 bins.
# Author's reading of the plot: the tallest bar covers roughly -20 to 0, so
# most respondents are at, or within about 20 lb above, their desired weight,
# which suggests most people are fairly satisfied with their current weight.
hist(x = wdiff, breaks = 50)

# Index plot: each wdiff value drawn against its position in the vector.
plot(x = wdiff)

# Pair every wdiff value with the respondent's gender. data.frame() derives
# the second column's name from the expression, so it is called "cdc.gender".
gen_diff <- data.frame(wdiff, cdc$gender)

# men_wm: the rows of gen_diff for men (gender "m"). which() skips rows where
# the comparison is NA, and drop = FALSE keeps the result a data frame.
men_wm <- gen_diff[which(gen_diff$cdc.gender == "m"), , drop = FALSE]
summary(object = men_wm)
## wdiff cdc.gender
## Min. :-300.00 m:9569
## 1st Qu.: -20.00 f: 0
## Median : -5.00
## Mean : -10.71
## 3rd Qu.: 0.00
## Max. : 500.00
# wm_men: despite its name, this holds the rows of gen_diff for women
# (gender "f"). which() skips rows where the comparison is NA, and
# drop = FALSE keeps the result a data frame.
wm_men <- gen_diff[which(gen_diff$cdc.gender == "f"), , drop = FALSE]
summary(object = wm_men)
## wdiff cdc.gender
## Min. :-300.00 m: 0
## 1st Qu.: -27.00 f:10431
## Median : -10.00
## Mean : -18.15
## 3rd Qu.: 0.00
## Max. : 83.00
#5.) Using numerical summaries and a side-by-side box plot, determine if men tend to view their weight differently than women.
# Side-by-side box plots of wdiff, one box per level of cdc.gender, to compare
# how the two groups' desired-minus-current weight differences are distributed.
boxplot(gen_diff$wdiff ~ gen_diff$cdc.gender)

#6.) Now it’s time to get creative. Find the mean and standard deviation of weight and determine what proportion of the weights are within one standard deviation of the mean.
# Mean and standard deviation of current weight.
avg_wt <- mean(x = cdc$weight)
sd_wt <- sd(x = cdc$weight)

# one_dev: rows of cdc whose weight lies strictly between (mean - sd) and
# (mean + sd). which() skips rows where the comparison is NA.
one_dev <- cdc[
  which(cdc$weight > (avg_wt - sd_wt) & cdc$weight < (avg_wt + sd_wt)),
  ,
  drop = FALSE
]

# Proportion of all respondents that fall within one standard deviation.
nrow(one_dev) / nrow(cdc)
## [1] 0.7076
# Load the Pokemon data set from the working directory.
# Text columns are read as factors explicitly: since R 4.0.0 read.csv() keeps
# them as character by default, and summary() then prints only
# Length/Class/Mode for those columns instead of the per-level counts shown in
# the recorded output below.
Pokemon <- read.csv("Pokemon.csv", stringsAsFactors = TRUE)
#1.) Use the summary command to show a summary of your dataset.
summary(Pokemon)
## X. Name Type.1 Type.2
## Min. : 1.0 Abomasnow : 1 Water :112 :386
## 1st Qu.:184.8 AbomasnowMega Abomasnow: 1 Normal : 98 Flying : 97
## Median :364.5 Abra : 1 Grass : 70 Ground : 35
## Mean :362.8 Absol : 1 Bug : 69 Poison : 34
## 3rd Qu.:539.2 AbsolMega Absol : 1 Psychic: 57 Psychic : 33
## Max. :721.0 Accelgor : 1 Fire : 52 Fighting: 26
## (Other) :794 (Other):342 (Other) :189
## Total HP Attack Defense
## Min. :180.0 Min. : 1.00 Min. : 5 Min. : 5.00
## 1st Qu.:330.0 1st Qu.: 50.00 1st Qu.: 55 1st Qu.: 50.00
## Median :450.0 Median : 65.00 Median : 75 Median : 70.00
## Mean :435.1 Mean : 69.26 Mean : 79 Mean : 73.84
## 3rd Qu.:515.0 3rd Qu.: 80.00 3rd Qu.:100 3rd Qu.: 90.00
## Max. :780.0 Max. :255.00 Max. :190 Max. :230.00
##
## Sp..Atk Sp..Def Speed Generation Legendary
## Min. : 10.00 Min. : 20.0 Min. : 5.00 Min. :1.000 False:735
## 1st Qu.: 49.75 1st Qu.: 50.0 1st Qu.: 45.00 1st Qu.:2.000 True : 65
## Median : 65.00 Median : 70.0 Median : 65.00 Median :3.000
## Mean : 72.82 Mean : 71.9 Mean : 68.28 Mean :3.324
## 3rd Qu.: 95.00 3rd Qu.: 90.0 3rd Qu.: 90.00 3rd Qu.:5.000
## Max. :194.00 Max. :230.0 Max. :180.00 Max. :6.000
##
#2.) Create a histogram of one of the numeric variables in your dataset. Describe the distribution of the variable for which you created a histogram.
# Histogram of the Attack column using hist()'s default bins.
hist(x = Pokemon$Attack)

#3.) Create a boxplot which compares values of a selected variable across 2 or more categories. What insights do you draw from the boxplot?
# Attack minus Defense for every row of Pokemon
# (positive = Attack exceeds Defense).
atkdiff <- Pokemon$Attack - Pokemon$Defense

# Pair each difference with its generation. data.frame() derives the second
# column's name from the expression, so it is called "Pokemon.Generation".
def_diff <- data.frame(atkdiff, Pokemon$Generation)

# One box per generation. Both the response and the grouping variable are
# taken from def_diff, so they always refer to the same rows, and the axes get
# readable labels instead of the raw R expressions.
boxplot(
  atkdiff ~ Pokemon.Generation,
  data = def_diff,
  xlab = "Generation",
  ylab = "Attack - Defense"
)

#4.) Create a scatterplot of two numeric variables, and describe the relationship between the two variables.
# Scatterplot with Attack on the x-axis and HP on the y-axis, one point per
# row of Pokemon.
plot(x = Pokemon$Attack, y = Pokemon$HP)
