final tweaks for production

kyleviloria · Mar 2, 2019 · 4383481 · 4383481
1 parent 771317e
commit 4383481
Show file tree

Hide file tree

Showing 5 changed files with 221 additions and 125 deletions.
diff --git a/boxplots_violins.Rmd b/boxplots_violins.Rmd
@@ -42,9 +42,11 @@ lincoln_weather %>%
 
 
 lincoln_errbar <- ggplot(lincoln_df, aes(x = month_short, y = `Mean Temperature [F]`)) +
-  stat_summary(fun.y = mean, fun.ymax = function(x) {mean(x) + 2*sd(x)},
-               fun.ymin = function(x) {mean(x) - 2*sd(x)}, geom = "pointrange",
-               fatten = 5) +
+  stat_summary(
+    fun.y = mean, fun.ymax = function(x) {mean(x) + 2*sd(x)},
+    fun.ymin = function(x) {mean(x) - 2*sd(x)}, geom = "pointrange",
+    fatten = 5
+  ) +
   xlab("month") + 
   ylab("mean temperature (°F)") +
   theme_dviz_open() +

diff --git a/choosing_visualization_software.Rmd b/choosing_visualization_software.Rmd
@@ -26,36 +26,57 @@ Throughout this book, we have seen many examples of figures that reproduce but d
 
 (ref:lincoln-repro) Repeat and reproduction of a figure. Part (a) is a near-complete repeat of Figure \@ref(fig:lincoln-temp-jittered). With the exception of the exact sizes of the text elements and points, which were adjusted so the figure remains legible at the reduced size, the two figures are identical down to the random jitter that was applied to each point. By contrast, part (b) is a reproduction but not a repeat. In particular, the jitter in part (b) differs from the jitter in part (a) or in Figure \@ref(fig:lincoln-temp-jittered).
 
-```{r lincoln-repro, fig.width = 8.5, fig.asp = .32, fig.cap = '(ref:lincoln-repro)'}
-ggridges::lincoln_weather %>% mutate(month_short = fct_recode(Month,
-                                                    Jan = "January",
-                                                    Feb = "February",
-                                                    Mar = "March",
-                                                    Apr = "April",
-                                                    May = "May",
-                                                    Jun = "June",
-                                                    Jul = "July",
-                                                    Aug = "August",
-                                                    Sep = "September",
-                                                    Oct = "October",
-                                                    Nov = "November",
-                                                    Dec = "December")) %>%
+```{r lincoln-repro, fig.width = 5.5*6/4.2, fig.asp = .32, fig.cap = '(ref:lincoln-repro)'}
+ggridges::lincoln_weather %>% 
+  mutate(
+    month_short = fct_recode(
+      Month,
+      Jan = "January",
+      Feb = "February",
+      Mar = "March",
+      Apr = "April",
+      May = "May",
+      Jun = "June",
+      Jul = "July",
+      Aug = "August",
+      Sep = "September",
+      Oct = "October",
+      Nov = "November",
+      Dec = "December"
+    )
+  ) %>%
   mutate(month_short = fct_rev(month_short)) -> lincoln_df
 
-lincoln1 <- ggplot(lincoln_df, aes(x = month_short, y = `Mean Temperature [F]`)) +
-  geom_point(position = position_jitter(width = .15, height = 0, seed = 320),
-             size = .5) +
+lincoln1 <- ggplot(
+  lincoln_df, aes(x = month_short, y = `Mean Temperature [F]`)
+) +
+  geom_point(
+    position = position_jitter(width = .15, height = 0, seed = 320),
+    size = .35
+  ) +
   xlab("month") + 
   ylab("mean temperature (°F)") +
-  theme_dviz_open(12) + theme(plot.margin = margin(3, 12, 3, 1.5))
-
-lincoln2 <- ggplot(lincoln_df, aes(x = month_short, y = `Mean Temperature [F]`)) +
-  geom_point(position = position_jitter(width = .2, height = 0, seed = 321),
-             size = .5, color = darken("#0072B2", .3)) +
+  theme_dviz_open(.655*14) + 
+  theme(
+    axis.line = element_line(size = .655*.5),
+    axis.ticks = element_line(size = .655*.5),
+    plot.margin = margin(2, 6, 2, 1.5)
+  )
+
+lincoln2 <- ggplot(
+  lincoln_df, aes(x = month_short, y = `Mean Temperature [F]`)
+) +
+  geom_point(
+    position = position_jitter(width = .2, height = 0, seed = 321),
+    size = .5, color = darken("#0072B2", .3)
+  ) +
   xlab("month") + 
   ylab("mean temperature (°F)") +
-  theme_dviz_grid(12) + theme(axis.ticks.length = grid::unit(0, "pt"),
-                                 axis.ticks = element_blank())
+  theme_dviz_grid(12) + 
+  theme(
+    axis.ticks.length = grid::unit(0, "pt"),
+    axis.ticks = element_blank()
+  )
 
 plot_grid(lincoln1, lincoln2, labels = "auto", ncol = 2)
 ```
@@ -89,40 +110,60 @@ In the software I have used for this book, ggplot2, separation of content and de
 
 (ref:unemploy-themes) Number of unemployed persons in the U.S. from 1970 to 2015. The same figure is displayed using four different ggplot2 themes: (a) the default theme for this book; (b) the default theme of ggplot2, the plotting software I have used to make all figures in this book; (c) a theme that mimics visualizations shown in the Economist; (d) a theme that mimics visualizations shown by FiveThirtyEight. FiveThirtyEight often forgoes axis labels in favor of plot titles and subtitles, and therefore I have adjusted the figure accordingly. Data source: U.S. Bureau of Labor Statistics
 
-```{r unemploy-themes, fig.width = 8.5, fig.asp = 0.75, fig.cap = '(ref:unemploy-themes)'}
+```{r unemploy-themes, fig.width = 5.5*6/4.2, fig.asp = 0.75, fig.cap = '(ref:unemploy-themes)'}
 unemploy_base <- ggplot(economics, aes(x = date, y = unemploy)) +
-  scale_y_continuous(name = "unemployed (x1000)",
-                     limits = c(0, 17000),
-                     breaks = c(0, 5000, 10000, 15000),
-                     labels = c("0", "5000", "10,000", "15,000"),
-                     expand = c(0.04, 0)) +
-  scale_x_date(name = "year",
-               expand = c(0.01, 0))
-
-unemploy_p1 <- unemploy_base + theme_dviz_grid(12) +
+  scale_y_continuous(
+    name = "unemployed (x1000)",
+    limits = c(0, 17000),
+    breaks = c(0, 5000, 10000, 15000),
+    labels = c("0", "5000", "10,000", "15,000"),
+    expand = c(0.04, 0)
+  ) +
+  scale_x_date(
+    name = "year",
+    expand = c(0.01, 0)
+  )
+
+unemploy_p1 <- unemploy_base + 
+  theme_dviz_grid(12) +
   geom_line(color = "#0072B2") +
-  theme(axis.ticks.length = grid::unit(0, "pt"),
-        axis.ticks = element_blank(),
-        plot.margin = margin(6, 6, 6, 2))
+  theme(
+    axis.ticks.length = grid::unit(0, "pt"),
+    axis.ticks = element_blank(),
+    plot.margin = margin(6, 6, 6, 2)
+  )
+
 unemploy_p2 <- unemploy_base + geom_line() + theme_gray()
-unemploy_p3 <- unemploy_base + geom_line(aes(color = "unemploy"), show.legend = FALSE, size = .75) +
-  theme_economist() + scale_color_economist() +
+
+unemploy_p3 <- unemploy_base + 
+  geom_line(aes(color = "unemploy"), show.legend = FALSE, size = .75) +
+  theme_economist() + 
+  scale_color_economist() +
   theme(panel.grid.major = element_line(size = .75))
-unemploy_p4 <- unemploy_base + geom_line(aes(color = "unemploy"), show.legend = FALSE) +
+
+unemploy_p4 <- unemploy_base + 
+  geom_line(aes(color = "unemploy"), show.legend = FALSE) +
   scale_color_fivethirtyeight() +
-  labs(title = "United States unemployment",
-       subtitle = "Unemployed persons (in thousands) from 1967\nto 2015") +
+  labs(
+    title = "United States unemployment",
+    subtitle = "Unemployed persons (in thousands) from\n1967 to 2015"
+  ) +
   theme_fivethirtyeight() +
-  theme(plot.title = element_text(size = 14))
-
-plot_grid(unemploy_p1, NULL, unemploy_p2, 
-          NULL, NULL, NULL,
-          unemploy_p3, NULL, unemploy_p4,
-          labels = c("a", "", "b", "", "", "", "c", "", "d"),
-          hjust = -.5,
-          vjust = 1.5,
-          rel_widths = c(1, .02, 1),
-          rel_heights = c(1, .02, 1))
+  theme(
+    plot.title = element_text(size = 12, margin = margin(0, 0, 3, 0)),
+    plot.subtitle = element_text(size = 10, lineheight = 1)
+  )
+
+plot_grid(
+  unemploy_p1, NULL, unemploy_p2, 
+  NULL, NULL, NULL,
+  unemploy_p3, NULL, unemploy_p4,
+  labels = c("a", "", "b", "", "", "", "c", "", "d"),
+  hjust = -.5,
+  vjust = 1.5,
+  rel_widths = c(1, .02, 1),
+  rel_heights = c(1, .02, 1)
+)
 ```
 
 Separation of content and design allows data scientists and designers to each focus on what they do best. Most data scientists are not designers, and therefore their primary concern should be the data, not the design of a visualization. Likewise, most designers are not data scientists, and they should be able to provide a unique and appealing visual language for figures without having to worry about specific data, appropriate transformations, and so on. The same principle of separating content and design has long been followed in the publishing world of books, magazines, newspapers, and websites, where writers provide content but not layout or design. Layout and design are created by a separate group of people who specialize in this area and who ensure that the publication appears in a visually consistent and appealing style. This principle is logical and useful, but it is not yet that widespread in the data visualization world.

diff --git a/image_file_formats.Rmd b/image_file_formats.Rmd
@@ -32,7 +32,7 @@ Vector graphics are also called "resolution-independent," because they can be ma
 
 (ref:bitmap-zoom) Illustration of the key difference between vector graphics and bitmaps. (a) Original image. The black square indicates the area we are magnifying in parts (b) and (c). (b) Increasing magnification of the highlighted area from part (a) when the image has been stored as a bitmap graphic. We can see how the image becomes increasingly pixelated as we zoom in further. (c) Increasing magnification of a vector representation of the image. The image maintains perfect sharpness at arbitrary magnification levels.
 
-```{r bitmap-zoom, fig.width = 8.5, fig.asp = 1/2.8, fig.cap='(ref:bitmap-zoom)'}
+```{r bitmap-zoom, fig.width = 5.5*6/4.2, fig.asp = 1/2.8, fig.cap='(ref:bitmap-zoom)'}
 library(magick)
 
 # support functions ---------------------------------------------------------

diff --git a/no_3d.Rmd b/no_3d.Rmd
@@ -39,7 +39,7 @@ Consider a 3D scatter plot of fuel efficiency versus displacement and power for
 (ref:mtcars-3d) Fuel efficiency versus displacement and power for 32 cars (1973–74 models). Each dot represents one car, and the dot color represents the number of cylinders of the car. The four panels (a)--(d) show exactly the same data but use different perspectives. Data source: *Motor Trend,* 1974.
 
 
-```{r mtcars-3d, fig.asp = 1.1, fig.cap = '(ref:mtcars-3d)'}
+```{r mtcars-3d, fig.width = 5*6/4.2, fig.asp = 1.1, fig.cap = '(ref:mtcars-3d)'}
 library(plot3D)
 library(cowplot)
 set_null_device("png")
@@ -48,25 +48,38 @@ colors <- c("#0072B2", "#CC79A7", "#E69F00")
 
 cyls <- data.frame(cyl = factor(c(4, 6, 8)))
 
-p <- ggplot(cyls, aes(cyl, cyl, color = cyl)) + geom_point() +
-  scale_color_manual(values = colors, name = "cylinders") +
-  theme_dviz_open(font_size = 12, font_family = dviz_font_family) +
-  theme(legend.position = "top",
-        legend.justification = "right")
+p <- ggplot(cyls, aes(cyl, cyl, color = cyl)) + 
+  geom_point(size = 2.5) +
+  scale_color_manual(
+    values = colors,
+    name = "cylinders ",
+    labels = c("4  ", "6  ", "8")
+  ) +
+  theme_dviz_open(font_size = 14, rel_small = 1, font_family = dviz_font_family) +
+  theme(
+    legend.position = "top",
+    legend.justification = "right",
+    legend.key.width = unit(4, "pt")
+  )
 legend <- get_legend(p)
 
 pfun <- function(theta = 30, phi = 20) {
   function() {
-    par(xpd = NA,
-        bg = "transparent",
-        mai = c(0, 0.1, 0, 0),
-        family = dviz_font_family_condensed
+    par(
+      xpd = NA,
+      bg = "transparent",
+      mai = c(0, 0.1, 0, 0),
+      family = dviz_font_family_condensed
+    )
+    scatter3D(
+      mtcars$disp, mtcars$hp, mtcars$mpg, colvar = mtcars$cyl,
+      col = colors,
+      pch = 19, bty ="b2", theta = theta, phi = phi, colkey = FALSE, 
+      xlab = "displacement (cu. in.)",
+      ylab ="power (hp)",
+      zlab = "efficiency (mpg)",
+      cex.lab = 1.17
     )
-    scatter3D(mtcars$disp, mtcars$hp, mtcars$mpg, colvar = mtcars$cyl,
-              col = colors,
-              pch = 19, bty ="b2", theta = theta, phi = phi, colkey = FALSE, 
-              xlab = "displacement (cu. in.)", ylab ="power (hp)", zlab = "efficiency (mpg)",
-              cex.lab = 1) #0.857)
   }
 }
 
@@ -84,28 +97,34 @@ Our visual system nevertheless attempts to invert the 3D to 2D transformation. H
 
 (ref:mtcars-3d-no-axes) Fuel efficiency versus displacement and power for 32 cars (1973–74 models). The four panels (a)--(d) correspond to the same panels in Figure \@ref(fig:mtcars-3d), except that all grid lines providing depth cues have been removed. Data source: *Motor Trend,* 1974.
 
-```{r mtcars-3d-no-axes, fig.asp = 1.1, fig.cap = '(ref:mtcars-3d-no-axes)'}
+```{r mtcars-3d-no-axes, fig.width = 5*6/4.2, fig.asp = 1.1, fig.cap = '(ref:mtcars-3d-no-axes)'}
 pfun2 <- function(theta = 30, phi = 20) {
   function() {
-    par(xpd = NA,
-        bg = "transparent",
-        mai = c(0, 0.1, 0, 0),
-        family = dviz_font_family_condensed
+    par(
+      xpd = NA,
+      bg = "transparent",
+      mai = c(0, 0.1, 0, 0),
+      family = dviz_font_family_condensed
+    )
+    scatter3D(
+      mtcars$disp, mtcars$hp, mtcars$mpg, colvar = mtcars$cyl,
+      col = colors,
+      pch = 19, axes = FALSE, theta = theta, phi = phi,
+      colkey = FALSE, box = FALSE,
+      cex.lab = 1.17
     )
-    scatter3D(mtcars$disp, mtcars$hp, mtcars$mpg, colvar = mtcars$cyl,
-              col = colors,
-              pch = 19, axes = FALSE, theta = theta, phi = phi, colkey = FALSE, box = FALSE,
-              cex.lab = 1) #0.857)
   }
 }
 
 
-plot_grid(pfun2(30, 20), pfun2(-30, 20), 
-          NULL, legend,
-          pfun2(30, 40), pfun2(-30, 40),
-          rel_heights = c(1, 0.1, 1), ncol = 2,
-          labels = c("a", "b", "", "", "c", "d"),
-          label_fontface = "plain", label_fontfamily = dviz_font_family)
+plot_grid(
+  pfun2(30, 20), pfun2(-30, 20), 
+  NULL, legend,
+  pfun2(30, 40), pfun2(-30, 40),
+  rel_heights = c(1, 0.1, 1), ncol = 2,
+  labels = c("a", "b", "", "", "c", "d"),
+  label_fontface = "plain", label_fontfamily = dviz_font_family
+)
 
 ```
 
@@ -115,7 +134,7 @@ Here, I want to show two alternative ways of plotting exactly the variables used
 
 (ref:mtcars-2d-multiple) Fuel efficiency versus displacement (a) and power (b). Data source: *Motor Trend,* 1974.
 
-```{r mtcars-2d-multiple, fig.asp = .45, fig.cap = '(ref:mtcars-2d-multiple)'}
+```{r mtcars-2d-multiple, fig.width = 5*6/4.2, fig.asp = .45, fig.cap = '(ref:mtcars-2d-multiple)'}
 p1 <- ggplot(mtcars, aes(x = disp, y = mpg, color = factor(cyl))) +
   geom_point(size = 1.5) + 
   scale_color_manual(values = colors, name = "cylinders", guide = "none") +
@@ -132,11 +151,12 @@ p2 <- ggplot(mtcars, aes(x = hp, y = mpg, color = factor(cyl))) +
   theme(
     legend.position = c(1, 1),
     legend.justification = c(1, 1),
-    legend.spacing.y = grid::unit(3, "pt")
+    legend.spacing.y = grid::unit(3, "pt"),
+    legend.key.width = unit(6, "pt")
   )
 
 
-plot_grid(p1, p2, labels = "auto")
+plot_grid(p1, align_legend(p2), labels = "auto")
 ```
 
 (ref:mtcars-2d-size) Power versus displacement for 32 cars, with fuel efficiency represented by dot size. Data source: *Motor Trend,* 1974.
@@ -170,7 +190,7 @@ You may wonder whether the problem with 3D scatter plots is that the actual data
 
 (ref:VA-death-rates-3d) Mortality rates in Virginia in 1940, visualized as a 3D bar plot. Mortality rates are shown for four groups of people (urban and rural females and males) and five age categories (50--54, 55--59, 60--64, 65--69, 70--74), and they are reported in units of deaths per 1000 persons. This figure is labeled as "bad" because the 3D perspective makes the plot difficult to read. Data source: @Molyneaux-et-al-1947
 
-```{r VA-death-rates-3d, fig.width = 4.64, fig.asp = 0.8, fig.cap = '(ref:VA-death-rates-3d)'}
+```{r VA-death-rates-3d, fig.width = 5, fig.asp = 0.8, fig.cap = '(ref:VA-death-rates-3d)'}
 pfun3 <- function() {
 
   par(xpd = NA,
@@ -216,15 +236,19 @@ In general, it is better to use Trellis plots (Chapter \@ref(multi-panel-figures
 
 (ref:VA-death-rates-Trellis) Mortality rates in Virginia in 1940, visualized as a Trellis plot. Mortality rates are shown for four groups of people (urban and rural females and males) and five age categories (50--54, 55--59, 60--64, 65--69, 70--74), and they are reported in units of deaths per 1000 persons.  Data source: @Molyneaux-et-al-1947
 
-```{r VA-death-rates-Trellis, fig.cap = '(ref:VA-death-rates-Trellis)'}
+```{r VA-death-rates-Trellis, fig.width = 5*6/4.2, fig.cap = '(ref:VA-death-rates-Trellis)'}
 df <- data.frame(VADeaths)
 df$age <- row.names(df)
 row.names(df) <- NULL
 df_long <- gather(df, type, rate, -age) %>%
-  mutate(type = case_when(type == "Urban.Male" ~ "urban male",
-                          type == "Urban.Female" ~ "urban female",
-                          type == "Rural.Male" ~ "rural male",
-                          type == "Rural.Female" ~ "rural female"))
+  mutate(type = 
+    case_when(
+      type == "Urban.Male" ~ "urban male",
+      type == "Urban.Female" ~ "urban female",
+      type == "Rural.Male" ~ "rural male",
+      type == "Rural.Female" ~ "rural female"
+    )
+  )
 ggplot(df_long, aes(age, rate)) + 
   geom_col(fill = "#56B4E9D0") + 
   facet_wrap(~type) +