-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot3.R
340 lines (256 loc) · 14.9 KB
/
plot3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
################################################################################
# WHAT YOU NEED TO KNOW BEFORE EXECUTING THIS SCRIPT
################################################################################
# DESCRIPTION: The script 'plot3.R' constructs and saves a multiplot
# that presents some core aspects of the changes in emissions
# over time, by type of source, for Baltimore City.
# THE QUESTION: The plot tries to answer the question:
#
# Of the four types of sources indicated by the type
# (point, nonpoint, onroad, nonroad) variable,
# which of these four sources have seen decreases in emissions
# from 1999–2008 for Baltimore City?
# Which have seen increases in emissions from 1999–2008?
# WORK-FLOW: The script executes the following STEPS:
# 0. Loads the required libraries.
# 1. Downloads and extracts the data file if it doesn't exist.
# 2. Loads the data file in R.
# 3. Subsets the target variables and observations from the data.
# 4: Creates the data frames needed to construct each plot.
# 5. Creates all elementary plots that compose the multiplot.
# 6. Combines the plots, to create the multiplot,
# and saves it in the working directory.
# ABOUT THE DATA USED
# The data used to create the plot,
# comes from the 'National Emissions Inventory (NEI)' database.
# Specifically the table 'PM2.5 Emissions Data' (summarySCC_PM25.rds) was used,
# which is distributed inside a zip archive that can be downloaded through this link:
# "https://d396qusza40orc.cloudfront.net/exdata%2Fdata%2FNEI_data.zip"
# The script will download the data file if it doesn't exist in the working directory.
# More information about the dataset can be found at:
# https://www.epa.gov/air-emissions-inventories
# ABOUT THIS SCRIPT
# The script was created:
# - in RStudio Version 1.1.383
# - with R version 3.4.4
# -- used dplyr version 0.7.4
# -- used tidyr version 0.8.0
# -- used ggplot2 version 2.2.1
# -- used gridExtra version 2.3
# and the data was downloaded on 24 April 2018.
# As each step is executed, an informative message appears in the console.
# Announces on the console that the script has started.
message("EXECUTING THE SCRIPT: 'plot3.R'")
################################################################################
# STEP 0: Loads the required libraries
################################################################################
# Data-manipulation packages, used to build the data frames behind each
# component of the multiplot (pipes, filtering, grouping, summarising).
library(dplyr)
library(tidyr)
# Plotting package, used to build every elementary plot of the multiplot.
library(ggplot2)
# Layout package, used to combine the elementary plots into one multiplot.
library(gridExtra)
# Progress message: it only appears if every library() call above succeeded,
# because library() stops the script with an error when a package is missing.
message(" DONE --> STEP 0: Loads the required libraries.")
################################################################################
# STEP 1: Downloads and extracts the data file if it doesn't exist
################################################################################
# STEP 2 reads 'summarySCC_PM25.rds' from the working directory.
# When that file is missing, this step downloads the zipped data as 'data.zip',
# extracts it into the working directory and deletes the archive.
# Informative messages explain the situation to the user, and the script stops
# with an error if the download or the extraction does not deliver the file,
# instead of reporting success and failing later in STEP 2.
if (!file.exists("summarySCC_PM25.rds")) {
  message("The file summarySCC_PM25.rds ",
          "doesn't exists in the working directory. \n",
          "Trying to download the zipped data file ...")
  zip_data_url <- "https://d396qusza40orc.cloudfront.net/exdata%2Fdata%2FNEI_data.zip"
  # mode = "wb" writes the archive byte-for-byte; a text-mode transfer can
  # corrupt a zip file on Windows.
  # download.file() returns 0 on success, so any other status is a failure.
  if (download.file(url = zip_data_url, destfile = "data.zip", mode = "wb") != 0) {
    stop("The zipped data file could not be downloaded from: ", zip_data_url,
         call. = FALSE)
  }
  message("\t ... zipped data file was successfully downloaded.")

  # Extracts the content of 'data.zip' into the working directory.
  message("Trying to extract the data file...")
  unzip("data.zip")
  # The archive is no longer needed once its content has been extracted.
  file.remove("data.zip")
  # Confirms the expected file really came out of the archive
  # before announcing success.
  if (!file.exists("summarySCC_PM25.rds")) {
    stop("The file 'summarySCC_PM25.rds' was not found ",
         "after extracting the downloaded archive.",
         call. = FALSE)
  }
  message("\t ... data file was successfully extracted ",
          "and the zipped file removed. \n",
          "The data file 'summarySCC_PM25.rds' ",
          "is now present in the working directory.")
}
# Progress message, shown once this STEP has finished without an error.
message(" DONE --> STEP 1: Downloads and extracts the data file if it doesn't exist.")
################################################################################
# STEP 2: Loads the data file in R
################################################################################
# Reads the serialized 'PM2.5 Emissions Data' table from the working directory
# and keeps it in the object 'NEI' for the following steps.
NEI <- readRDS(file = "summarySCC_PM25.rds")
# Progress message, shown once the data has been read without an error.
message(" DONE --> STEP 2: Loads the data file in R.")
################################################################################
# STEP 3: Subsets the target variables and observations from the data
################################################################################
# Keeps only the Baltimore City observations (fips code "24510") and,
# in a single transmute(), retains the columns 'year', 'type' and 'Emissions'
# (in that order) while converting 'year' and 'type' to factors.
target_data <- NEI %>%
  filter(fips == "24510") %>%
  transmute(year = as.factor(year),
            type = as.factor(type),
            Emissions)
# Progress message, shown once the subset has been created without an error.
message(" DONE --> STEP 3: Subsets the target variables and observations from the data.")
################################################################################
# STEP 4: Creates the data frames needed to construct each plot
################################################################################
# For 'figure_top_left' and 'figure_top_right', the data contains
# the total emissions in PM2.5 by year and type of source.
data_total_emissions <- target_data %>%
  group_by(year, type) %>%
  summarise(total_emissions = sum(Emissions)) %>%
  ungroup()

# For 'figure_bottom_left', the data contains
# the changes in total emissions in PM2.5 by each type of source,
# for the periods 1999-2002, 2002-2005 and 2005-2008.
# Because some types of source may have no observations for a whole year,
# differencing consecutive rows can produce periods that span more than
# one step, so the valid periods are listed explicitly and everything else
# is filtered out.
valid_periods <- c("1999-2002", "2002-2005", "2005-2008")
# The yearly totals computed above are reused instead of being aggregated again.
data_change_total <- data_total_emissions %>%
  # Rows must be in chronological order inside each type,
  # so that the previous row is the previous available year.
  arrange(type, year) %>%
  group_by(type) %>%
  mutate(year_label = as.character(year),
         # The period is labelled "<previous year>-<current year>"; the first
         # row of each type has no previous year and yields an "NA-..." label.
         period = paste(dplyr::lag(year_label), year_label, sep = "-"),
         # The change is the current total minus the previous total.
         change = total_emissions - dplyr::lag(total_emissions)) %>%
  ungroup() %>%
  # Drops the first row of each type and any period spanning a missing year.
  filter(period %in% valid_periods) %>%
  # Keeps only the columns used by the plot; the factor levels fix the
  # chronological order of the periods on the x axis.
  transmute(type,
            period = factor(period, levels = valid_periods),
            change)

# For 'figure_bottom_right', the data contains
# the logarithmic transformations of emissions in PM2.5: ln(Emissions + 1)
# by year and type of source.
# log1p(x) computes ln(1 + x); the original 'Emissions' variable
# is no longer needed and is dropped by transmute().
data_log_emissions <- target_data %>%
  transmute(year,
            type,
            log_emissions = log1p(Emissions))
# Progress message, shown once all data frames have been created without an error.
message(" DONE --> STEP 4: Creates the data frames needed to construct each plot.")
################################################################################
# STEP 5: Creates all elementary plots that compose the multiplot
################################################################################
# Creates a named list that contains four ggplot objects,
# the components of the multiplot, in the order in which they are laid out:
# top left, top right, bottom left, bottom right.
components_of_multiplot <- list(
  # The first figure shows the total emissions by type of source,
  # for each of the years 1999, 2002, 2005 and 2008,
  # in a line chart that presents all types together for comparison.
  "figure_top_left" =
    ggplot(data_total_emissions, aes(x = year, y = total_emissions)) +
    geom_point(aes(color = type), size = 2) +
    scale_color_discrete(name = "Type of Source") +
    # 'year' is a factor, so the lines need an explicit group to be connected.
    geom_line(aes(color = type, group = type)) +
    labs(title = paste0("Total Emissions from PM2.5 ",
                        "by Type of Source, in Baltimore City")) +
    # The axis titles are blanked: the multiplot carries a common left label.
    xlab(label = "") +
    ylab(label = "") +
    theme_bw(base_size = 10),

  # The second figure shows the same totals by type of source,
  # for each of the years 1999, 2002, 2005 and 2008,
  # in four distinct bar charts, one for each type.
  "figure_top_right" =
    ggplot(data_total_emissions, aes(x = year, y = total_emissions)) +
    geom_col(aes(fill = type), position = "dodge", show.legend = FALSE) +
    labs(title = paste0("Total Emissions from PM2.5 ",
                        "by Type of Source in Baltimore City")) +
    xlab(label = "") +
    ylab(label = "") +
    facet_wrap(~ type, nrow = 1) +
    theme_linedraw(base_size = 10) +
    # The year labels are rotated so they fit under the narrow facets.
    theme(axis.text.x = element_text(angle = 90, hjust = 1)),

  # The third figure shows the changes in total emissions by type of source,
  # for the periods 1999-2002, 2002-2005 and 2005-2008,
  # in four distinct bar charts, one for each type.
  # The diverging fill is centred on zero, so decreases and increases
  # get different colours.
  # NOTE(review): the plotted values are not rescaled in the visible code;
  # confirm that the unit stated in the legend title matches the data.
  "figure_bottom_left" =
    ggplot(data_change_total, aes(x = period, y = change)) +
    geom_col(aes(fill = change)) +
    scale_fill_gradient2(name = paste0("Changes in Emissions \n",
                                       "from PM2.5 \n",
                                       "(in thousand of tons)"),
                         low = "turquoise", mid = "black", high = "indianred",
                         midpoint = 0) +
    labs(title = paste0("Changes in Total Emissions from PM2.5 \n",
                        "over 3-year periods by Type of Source ",
                        "in Baltimore City")) +
    xlab(label = "") +
    ylab(label = "") +
    facet_wrap(~ type, ncol = 4) +
    theme_linedraw(base_size = 9) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)),

  # The fourth figure presents, in box plots, the logarithmic transformation
  # of the individual emission records: ln(Emissions + 1),
  # to expose in which type of source the bigger changes happened.
  "figure_bottom_right" =
    ggplot(data_log_emissions, aes(x = year, y = log_emissions)) +
    geom_boxplot(aes(fill = type), show.legend = FALSE) +
    labs(title = paste0("Logarithm of Emissions from PM2.5 ",
                        "by Type of Source in Baltimore City"),
         subtitle = paste0("The figure amplifies the changes happened ",
                           "in Emissions over the years \n",
                           "!! (It shouldn't be used to conclude about ",
                           "the magnitude of those changes)")) +
    xlab(label = "") +
    # The data holds the natural logarithm, so the axis is labelled 'ln'.
    ylab(label = "\n ln(Emissions + 1)") +
    facet_wrap(~ type, ncol = 4) +
    theme_linedraw(base_size = 9) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
)
# Progress message, shown once all plots have been created without an error.
message(" DONE --> STEP 5: Creates all elementary plots that compose the multiplot.")
################################################################################
# STEP 6: Combines the plots, to create the multiplot,
# and saves it in the working directory
################################################################################
# Constructs the multiplot from the four plots
# contained in the list created in STEP 5, arranged in two rows.
# arrangeGrob() only builds the layout object; unlike grid.arrange() it does
# not draw on the currently active device, so nothing is rendered before
# the png device is opened below.
multiplot <- arrangeGrob(grobs = components_of_multiplot,
                         nrow = 2,
                         # Adds the main question at the top of the multiplot.
                         top = paste0("QUESTION 3: \n",
                                      "Of the four types of sources indicated by the type ",
                                      "(point, nonpoint, onroad, nonroad) variable \n",
                                      "which of these four sources have seen decreases ",
                                      "in emissions from 1999–2008 for Baltimore City? \n",
                                      "Which have seen increases in emissions ",
                                      "from 1999–2008? \n",
                                      "(Use the ggplot2 plotting system to make a plot ",
                                      "answer this question.) \n"),
                         # Adds a common label at the left of the multiplot,
                         # next to the plots of the left column
                         # ('figure_top_left' and 'figure_bottom_left').
                         left = "Total Emissions from PM2.5 (in thousands of tons)"
)
# Opens a new png graphical device of 800x640 pixels,
# draws the multiplot and saves it as 'plot3.png' in the working directory.
png(filename = "plot3.png", width = 800, height = 640)
# The device is closed even if drawing fails, so no png device is left open
# and the error still stops the script.
tryCatch(grid::grid.draw(multiplot),
         finally = dev.off())
# Progress message, shown once the multiplot has been saved without an error.
message(" DONE --> STEP 6: Combines the plots, to create the multiplot, ",
        "and saves it in the working directory.")
# Final message, shown once the whole script has been sourced successfully.
message("SUCCESSfully created 'plot3.png' in the working directory.")
# THE END ######################################################################