###Start of assignment###

library(ggplot2) #load ggplot library

# Point the session at the folder that holds the assignment files.
# NOTE(review): this absolute path is machine-specific; the script only runs
# as-is on a machine that has this directory.
setwd("C:/Users/magnu/Documents/R")

# IMPORT THE DATA ----

# Import the nuclear power station construction data.
nukedata <- read.csv("nuclear.csv")

# Explore data characteristics.
summary(nukedata)

# Initial data conclusions:
# (1) 'cost': cost of construction in hundreds of millions
# (2) 'date': months of construction
# (3) 'cap': power generation capacity in MW
# (4) 'pr', 'ne', 'ct', 'bw', 'pt': each is either 0 or 1, so they likely mark
#     whether the station does or doesn't have a certain characteristic

#Meaningful Question: what is the relationship between cost, date, and/or capacity for nuclear power station construction?

# WRANGLE THE DATA ----

# Create a subset of the useful data: the cap, date, and cost columns
# (first 32 rows of each).
nuke_df <- data.frame(
  X = nukedata$cap[1:32],
  Y = nukedata$date[1:32],
  Z = nukedata$cost[1:32]
)

# Give the new data frame descriptive column names. These names contain spaces
# and parentheses, so they must be wrapped in backticks when referenced.
# NOTE(review): `date` is labelled here as a construction duration in months.
# If nuclear.csv is the `nuclear` dataset shipped with the boot package, `date`
# is the date the construction permit was issued, not a duration -- confirm
# before relying on this label.
names(nuke_df) <- c(
  "Capacity (MW)",
  "Construction Duration (mos)",
  "Cost ($ millions)"
)

# Review relevant data for the new data frame prior to plotting.
summary(nuke_df)

# PLOT THE DATA ----

# Scatter plot of the construction duration (values plotted against row index).
# Min: 67.17, Max: 71.08, Mean: 68.58
plot(nuke_df$`Construction Duration (mos)`)

# Histogram of the cost of construction (in millions of dollars).
# 200-300: 8, 300-400: 4, 400-500: 10, 500-600: 1, 600-700: 7, 700-800: 1,
# 800-900: 1
hist(nuke_df$`Cost ($ millions)`)

# Boxplot of the capacity for generation after construction.
# Min: 457, 1st Q: 745, Median: 822, 3rd Q: 947, Max: 1130
boxplot(nuke_df$`Capacity (MW)`)

# Apply ggplot to this dataset and interpret the results.
# ggplot() + geom_point() draws the same scatter plots as qplot(), which is
# deprecated in current ggplot2 releases.

# There is not a clear relationship between duration and capacity.
ggplot(nuke_df, aes(`Construction Duration (mos)`, `Capacity (MW)`)) +
  geom_point()

# There seems to be a correlation between duration of construction and cost.
ggplot(nuke_df, aes(`Construction Duration (mos)`, `Cost ($ millions)`)) +
  geom_point()

# There seems to be a correlation between capacity and cost.
ggplot(nuke_df, aes(`Capacity (MW)`, `Cost ($ millions)`)) +
  geom_point()

# Answer: there is a relationship between the chosen subset variables.
# While there is NOT a clear relationship between the duration and capacity,
# there is a relationship between the duration and cost of construction AND
# the capacity and cost of construction.

# Longer construction projects (i.e., greater duration) and larger construction
# projects (i.e., greater capacity) can be directly tied to a greater cost.
# Thus, if we're trying to lower the cost, we can focus on one or both of these
# variables to do so.
# TODO (left off here): explain why this can be useful. From this direct
# relationship, we can indirectly tie the duration of the project to the ...

###BONUS: read the .csv file from github###

library(RCurl)

# Download the raw CSV text from GitHub, then parse it from the string.
x <- getURL(
  "https://raw.githubusercontent.com/Magnus-PS/CUNY-Bridge/master/nuclear.csv"
)
y <- read.csv(text = x)
summary(y)

###End of assignment###