fixes from copy edit

kyleviloria · Feb 11, 2019 · a3de8b5 · a3de8b5
1 parent 9dd33c1
commit a3de8b5
Show file tree

Hide file tree

Showing 13 changed files with 50 additions and 48 deletions.
diff --git a/directory_of_visualizations.Rmd b/directory_of_visualizations.Rmd
@@ -361,12 +361,12 @@ df_multi_dist_small <- group_by(df_multi_dist, type) %>%
 
 p3 <- ggplot(df_multi_dist_small, aes(x = type, y = y)) + 
   geom_jitter(color = palette[1], width = 0.15, height = 0, size = .3) +
-  labs(title = "strip chart") +
+  labs(title = "strip charts") +
   theme_plot_icon_hgrid(palette[npal], palette[1])
 
 p4 <- ggplot(df_multi_dist_small, aes(x = type, y = y)) + 
   dviz.supp::stat_sina(color = palette[1], size = 0.3) +
-  labs(title = "sina plot") +
+  labs(title = "sina plots") +
   theme_plot_icon_hgrid(palette[npal], palette[1])
 
 p5 <- ggplot(df_multi_dist, aes(x = y, fill = factor(type, levels = c("C", "A", "B")))) + 
@@ -585,7 +585,7 @@ p1 <- ggplot(df_scatter_xy, aes(x, y)) +
   geom_point(fill = palette[2], color = palette[npal], pch = 21, size = 2.4) + 
   scale_x_continuous(expand = c(.2, 0)) +
   scale_y_continuous(expand = c(.2, 0)) +
-  labs(title = "scatter plot") +
+  labs(title = "scatterplot") +
   theme_plot_icon(palette[npal], palette[1])
 
 p2 <- ggplot(df_scatter_xyz, aes(x, y, size = z)) + 
@@ -604,7 +604,7 @@ p3 <- ggplot(spread(df_paired, x, y), aes(A, B)) +
   ) +
   scale_x_continuous(limits = c(1.5, 6.5)) +
   scale_y_continuous(limits = c(1.5, 6.5)) +
-  labs(title = "paired scatter plot") +
+  labs(title = "paired scatterplot") +
   theme_plot_icon(palette[npal], palette[1])
 
 p4 <- ggplot(df_paired, aes(x, y, group = group)) + 
@@ -626,7 +626,7 @@ plot_grid(p1, p2, p3, p4, ncol = 4, scale = .9)
 ```
 
 
-Scatter plots represent the archetypical visualization when we want to show one quantitative variable relative to another (Chapter \@ref(associations-scatterplots)). If we have three quantitative variables, we can map one onto the dot size, creating a variant of the scatter plot called bubble chart. For paired data, where the variables along the *x* and the *y* axes are measured in the same units, it is generally helpful to add a line indicating *x* = *y* (Chapter \@ref(associations-paired-data)). Paired data can also be shown as a slope graph of paired points connected by straight lines (Chapter \@ref(associations-paired-data)).
+Scatterplots represent the archetypical visualization when we want to show one quantitative variable relative to another (Chapter \@ref(associations-scatterplots)). If we have three quantitative variables, we can map one onto the dot size, creating a variant of the scatterplot called a bubble chart. For paired data, where the variables along the *x* and the *y* axes are measured in the same units, it is generally helpful to add a line indicating *x* = *y* (Chapter \@ref(associations-paired-data)). Paired data can also be shown as a slope graph of paired points connected by straight lines (Chapter \@ref(associations-paired-data)).
 
 ```{r xy-binning, fig.width = 6, fig.asp = 1/4}
 p5 <- ggplot(df_dense_scatter, aes(x, y)) + 
@@ -701,7 +701,7 @@ p2 <- ggplot(df_connected_scatter, aes(x, y, color = t, fill = t)) +
   ) +
   scale_x_continuous(limits = c(0.3, 3.7)) +
   scale_y_continuous(limits = c(-2.5, 2.5)) +
-  labs(title = "connected scatter plot") +
+  labs(title = "connected scatterplot") +
   theme_plot_icon(palette[npal], palette[1])
 
 p3 <- ggplot(df_dense_scatter_sample, aes(x, y)) +
@@ -719,7 +719,7 @@ p3 <- ggplot(df_dense_scatter_sample, aes(x, y)) +
 plot_grid(p1, p2, p3, ncol = 4, scale = .9)
 ```
 
-When the *x* axis represents time or a strictly increasing quantity such as a treatment dose, we commonly draw line graphs (Chapter \@ref(time-series)). If we have a temporal sequence of two response variables, we can draw a connected scatter plot where we first plot the two response variables in a scatter plot and then connect dots corresponding to adjacent time points (Chapter \@ref(time-series-connected-scatter)). We can use smooth lines to represent trends in a larger dataset (Chapter \@ref(visualizing-trends)). 
+When the *x* axis represents time or a strictly increasing quantity such as a treatment dose, we commonly draw line graphs (Chapter \@ref(time-series)). If we have a temporal sequence of two response variables, we can draw a connected scatterplot where we first plot the two response variables in a scatterplot and then connect dots corresponding to adjacent time points (Chapter \@ref(time-series-connected-scatter)). We can use smooth lines to represent trends in a larger dataset (Chapter \@ref(visualizing-trends)). 
 
 
 ## Geospatial data {#directory-geospatial-data}
@@ -971,7 +971,7 @@ p3 <- ggplot(df_uncertain, aes(y, type)) +
   ) +
   scale_x_continuous(limits = c(1.6, 6.4), expand = c(0, 0)) +
   scale_y_discrete(expand = expand_scale(add = c(0.2, 0.8))) +
-  labs(title = "half eyes") +
+  labs(title = "half-eyes") +
   theme_plot_icon(palette[npal], palette[1]) +
   theme(
     axis.line.y = element_blank(),
@@ -995,7 +995,7 @@ p4 <- ggplot(df_q, aes(x)) +
     expand = c(0.02, 0),
     limits = c(0, 0.4)
   ) +
-  labs(title = "quantile dotplot") +
+  labs(title = "quantile dot plot") +
   theme_plot_icon(palette[npal], palette[1]) +
   theme(
     axis.line.y = element_blank(),
@@ -1006,7 +1006,7 @@ plot_grid(p1, p2, p3, p4, ncol = 4, scale = .9)
   
 ```
 
-To achieve a more detailed visualization than is possible with error bars or graded error bars, we can visualize the actual confidence or posterior distributions (Chapter \@ref(visualizing-uncertainty)). Confidence strips provide a clear visual sense of uncertainty but are difficult to read accurately. Eyes and half-eyes combine error bars with approaches to visualize distributions (violins and ridgelines, respectively), and thus show both precise ranges for some confidence levels and the overall uncertainty distribution. A quantile dotplot can serve as an alternative visualization of an uncertainty distribution (Chapter \@ref(frequency-framing)). By showing the distribution in discrete units, the quantile dotplot is not as precise but can be easier to read than the continuous distribution shown by a violin or ridgeline plot.
+To achieve a more detailed visualization than is possible with error bars or graded error bars, we can visualize the actual confidence or posterior distributions (Chapter \@ref(visualizing-uncertainty)). Confidence strips provide a clear visual sense of uncertainty but are difficult to read accurately. Eyes and half-eyes combine error bars with approaches to visualize distributions (violins and ridgelines, respectively), and thus show both precise ranges for some confidence levels and the overall uncertainty distribution. A quantile dot plot can serve as an alternative visualization of an uncertainty distribution (Chapter \@ref(frequency-framing)). By showing the distribution in discrete units, the quantile dot plot is not as precise but can be easier to read than the continuous distribution shown by a violin or ridgeline plot.
 
 ```{r confidence-bands, fig.width = 6, fig.asp = 1/4}
 p1 <- ggplot(df_dense_scatter_sample, aes(x, y)) +

diff --git a/figure_titles_captions.Rmd b/figure_titles_captions.Rmd
@@ -27,7 +27,7 @@ corruption %>% filter(year == 2015) %>% na.omit() %>%
   mutate(region = case_when(
     region == "Middle East and North Africa" ~ "Middle East\nand North Africa",
     region == "Europe and Central Asia" ~ "Europe and\nCentral Asia",
-    region == "Sub Saharan Africa" ~ "Sub Saharan\nAfrica",
+    region == "Sub Saharan Africa" ~ "Sub-Saharan\nAfrica",
     TRUE ~ region),
     label = ifelse(country %in% country_highlight, country, "")
     ) %>%

diff --git a/figures/jpeg_example_combined.idraw b/figures/jpeg_example_combined.idraw
diff --git a/figures/jpeg_example_combined.pdf b/figures/jpeg_example_combined.pdf
diff --git a/figures/jpeg_example_combined.png b/figures/jpeg_example_combined.png
diff --git a/no_3d.Rmd b/no_3d.Rmd
@@ -136,7 +136,7 @@ p2 <- ggplot(mtcars, aes(x = hp, y = mpg, color = factor(cyl))) +
   )
 
 
-plot_grid(p1, p2)
+plot_grid(p1, p2, labels = "auto")
 ```
 
 (ref:mtcars-2d-size) Power versus displacement for 32 cars, with fuel efficiency represented by dot size. Data source: *Motor Trend,* 1974.

diff --git a/proportional_ink.Rmd b/proportional_ink.Rmd
@@ -14,7 +14,7 @@ library(treemapify)
 
 # The principle of proportional ink {#proportional-ink}
 
-In many different visualization scenarios, we represent data values by the extent of a graphical element. For example, in a bar plot, we draw bars that begin at 0 and end at the data value they represent. In this case, the data value is not only encoded in the end point of the bar but also in the height or length of the bar. If we drew a bar that started at a different value than 0, then the length of the bar and the bar endpoint would convey contradicting information. I consider such visualizations not just "bad" but "wrong," because they are internally inconsistent. They show two different values with the same graphical element. Contrast this to a scenario where we visualize the data value with a dot. In this case, the value is only encoded in the location of the dot but not in the size or shape of the dot.
+In many different visualization scenarios, we represent data values by the extent of a graphical element. For example, in a bar plot, we draw bars that begin at 0 and end at the data value they represent. In this case, the data value is not only encoded in the endpoint of the bar but also in the height or length of the bar. If we drew a bar that started at a different value than 0, then the length of the bar and the bar endpoint would convey contradicting information. Such figures are internally inconsistent, because they show two different values with the same graphical element. Contrast this to a scenario where we visualize the data value with a dot. In this case, the value is only encoded in the location of the dot but not in the size or shape of the dot.
 
 Similar issues will arise whenever we use graphical elements such as bars, rectangles, shaded areas of arbitrary shape, or any other elements that have a clear visual extent which can be either consistent or inconsistent with the data value shown. In all these cases, we need to make sure that there is no inconsistency. Bergstrom and West have termed this concept the *principle of proportional ink* [@BergstromWest2016]:
 
@@ -26,7 +26,7 @@ Similar issues will arise whenever we use graphical elements such as bars, recta
 
 We first consider the most common scenario, visualization of amounts along a linear scale. Figure \@ref(fig:hawaii-income-bars-bad) shows the median income in the five counties that make up the state of Hawaii. It is a typical figure one might encounter in a newspaper article. A quick glance at the figure suggests that the county of Hawaii is incredibly poor while the county of Honolulu is much richer than the other counties. However, Figure \@ref(fig:hawaii-income-bars-bad) is quite misleading, because all bars begin at \$50,000 median income. Thus, while the endpoint of each bar correctly represents the actual median income in each county, the bar height represents the extent to which median incomes exceed \$50,000, an arbitrary number. And human perception is such that the bar height is the key quantity we perceive when looking at this figure, not the location of the bar endpoint relative to the *y* axis.
 
-(ref:hawaii-income-bars-bad) Median income in the five counties of the state of Hawaii. This figure is incorrect, because the *y* axis scale starts at \$50,000 instead of \$0. As a result, the bar heights are not proportional to the values shown, and the income differential between the county of Hawaii and the other four counties appears much bigger than it actually is. Data source: 2015 Five-Year American Community Survey.
+(ref:hawaii-income-bars-bad) Median income in the five counties of the state of Hawaii. This figure is misleading, because the *y* axis scale starts at \$50,000 instead of \$0. As a result, the bar heights are not proportional to the values shown, and the income differential between the county of Hawaii and the other four counties appears much bigger than it actually is. Data source: 2015 Five-Year American Community Survey.
 
 ```{r hawaii-income-bars-bad, fig.cap = '(ref:hawaii-income-bars-bad)'}
 p_income_base <- ggplot(filter(hawaii_income, year == 2015), aes(x = reorder(county, desc(median_income)), y = median_income)) +
@@ -46,7 +46,7 @@ p_income_bad <- p_income_base +
     labels = function(x) paste0("$", scales::comma(x))
   )
 
-stamp_wrong(p_income_bad)
+stamp_bad(p_income_bad)
 ```
 
 An appropriate visualization of these data makes for a less exciting story (Figure \@ref(fig:hawaii-income-bars-good)). While there are differences in median income between the counties, they are nowhere near as big as Figure \@ref(fig:hawaii-income-bars-bad) suggested. Overall, the median incomes in the different counties are somewhat comparable.
@@ -67,12 +67,12 @@ p_income_good
 ```
 
 ```{block type='rmdtip', echo=TRUE}
-Bars on a linear scale must always start at 0.
+Bars on a linear scale should always start at 0.
 ```
 
 Similar visualization problems frequently arise in the visualization of time series, such as those of stock prices. Figure \@ref(fig:fb-stock-drop-bad) suggests a massive collapse in the stock price of Facebook occurred around Nov. 1, 2016. In reality, the price decline was moderate relative to the total price of the stock (Figure \@ref(fig:fb-stock-drop-good)). The *y*-axis range in Figure \@ref(fig:fb-stock-drop-bad) would be questionable even without the shading underneath the curve. But with the shading, the figure becomes particularly problematic. The shading emphasizes the distance from the location of the *x* axis to the specific *y* values shown, and thus it creates the visual impression that the height of the shaded area at a given day represents the stock price of that day. Instead, it only represents the difference in stock price from the baseline, which is $110 in Figure \@ref(fig:fb-stock-drop-bad).
 
-(ref:fb-stock-drop-bad) Stock price of Facebook (FB) from Oct. 22, 2016 to Jan. 21, 2017. This figure seems to imply that the Facebook stock pice collapsed around Nov. 1, 2016. However, this is misleading, because the *y* axis starts at $110 instead of $0.
+(ref:fb-stock-drop-bad) Stock price of Facebook (FB) from Oct. 22, 2016 to Jan. 21, 2017. This figure seems to imply that the Facebook stock price collapsed around Nov. 1, 2016. However, this is misleading, because the *y* axis starts at $110 instead of $0.
 
 ```{r fb-stock-drop-bad, fig.cap = '(ref:fb-stock-drop-bad)'}
 df_fb_drop <- filter(tech_stocks, ticker == "FB", date >= ymd("2016-10-22") & date < ymd("2017-01-22"))
@@ -90,7 +90,7 @@ fb_drop_bad <- ggplot(df_fb_drop, aes(x=date, height=price - 110, y = 110)) +
   background_grid(major = 'y', minor = 'none') +
   theme(plot.margin = margin(14, 7, 3, 1.5))
 
-stamp_wrong(fb_drop_bad)
+stamp_bad(fb_drop_bad)
 ```
 
 (ref:fb-stock-drop-good) Stock price of Facebook (FB) from Oct. 22, 2016 to Jan. 21, 2017. By showing the stock price on a *y* scale from $0 to $150, this figure more accurately relays the magnitude of the FB price drop around Nov. 1, 2016.
@@ -188,7 +188,7 @@ oc_bad <- ggplot(df_oceania, aes(x = reorder(country, -GDP), y = log10(GDP))) +
   theme(axis.ticks.y = element_blank(),
         plot.margin = margin(12, 6, 3, 1.5))
 
-stamp_wrong(oc_bad)
+stamp_bad(oc_bad)
 ```
 
 However, the visualization with bars on a log scale (Figure \@ref(fig:oceania-gdp-logbars)) does not work either. The bars start at an arbitrary value of 0.3 billion USD, and at a minimum the figure suffers from the same problem as Figure \@ref(fig:hawaii-income-bars-bad), namely that the bar lengths are not representative of the data values. The added difficulty with a log scale, though, is that we cannot simply let the bars start at 0. In Figure \@ref(fig:oceania-gdp-logbars), the value 0 would lie infinitely far to the left. Therefore, we could make our bars arbitrarily long by pushing their origin further and further away (see, e.g., Figure \@ref(fig:oceania-gdp-logbars-long)). This problem always arises when we try to visualize amounts (which is what the GDP values are) on a log scale.
@@ -207,7 +207,7 @@ oc_bad2 <- ggplot(df_oceania, aes(x = reorder(country, -GDP), y = log10(GDP))) +
   theme(axis.ticks.y = element_blank(),
         plot.margin = margin(12, 6, 3, 1.5))
 
-stamp_wrong(oc_bad2)
+stamp_bad(oc_bad2)
 ```
 
 For the data of Figure \@ref(fig:oceania-gdp-logbars), I think bars are inappropriate. Instead, we can simply place a dot at the appropriate location along the scale for each country's GDP and avoid the issue of bar lengths altogether (Figure \@ref(fig:oceania-gdp-dots)). Importantly, by placing the country names right next to the dots rather than along the *y* axis, we avoid generating the visual perception of a magnitude conveyed by the distance from the country name to the dot.

diff --git a/small_axis_labels.Rmd b/small_axis_labels.Rmd
@@ -3,7 +3,7 @@
 source("_common.R")
 ```
 
-# Your axis labels are too small {#small-axis-labels}
+# Use larger axis labels {#small-axis-labels}
 
 If you take away only one single lesson from this book, make it this one: Pay attention to your axis labels, axis tick labels, and other assorted plot annotations. Chances are they are too small. In my experience, nearly all plot libraries and graphing software have poor defaults. If you use the default values, you're almost certainly making a poor choice.
 

diff --git a/time_series.Rmd b/time_series.Rmd
@@ -238,7 +238,7 @@ p1 <- ggplot(CA_house_prices, aes(date, house_price_perc)) +
   scale_y_continuous(
     limits = c(-0.3, .32), expand = c(0, 0),
     breaks = c(-.3, -.15, 0, .15, .3),
-    name = "12-month change\nin house prices", labels = scales::percent
+    name = "12-month change\nin house prices", labels = scales::percent_format(accuracy = 1)
   ) + 
   scale_x_date(name = "", expand = c(0, 0)) +
   coord_cartesian(clip = "off") +
@@ -252,7 +252,7 @@ p2 <- ggplot(CA_house_prices, aes(date, unemploy_perc/100)) +
   geom_line(size = 0.75, color = "#0072b2") +
   scale_y_continuous(
     limits = c(0.037, 0.143),
-    name = "unemploy-\nment rate", labels = scales::percent,
+    name = "unemploy-\nment rate", labels = scales::percent_format(accuracy = 1),
     expand = c(0, 0)
   ) +
   scale_x_date(name = "year", expand = c(0, 0)) +
@@ -285,13 +285,13 @@ ggplot(CA_house_prices) +
   ) +
   scale_x_continuous(
     limits = c(0.037, 0.143),
-    name = "unemployment rate", labels = scales::percent,
+    name = "unemployment rate", labels = scales::percent_format(accuracy = 1),
     expand = c(0, 0)
   ) +
   scale_y_continuous(
     limits = c(-0.315, .315), expand = c(0, 0),
     breaks = c(-.3, -.15, 0, .15, .3),
-    name = "12-month change in house prices", labels = scales::percent
+    name = "12-month change in house prices", labels = scales::percent_format(accuracy = 1)
   ) + 
   scale_colour_gradient(low = "#E7F0FF", high = "#035B8F") + #"#0072b2") +
   guides(colour = FALSE) +
@@ -316,13 +316,13 @@ p <- ggplot(CA_house_prices) +
   geom_path(size = 0.75, lineend = "round", color = "#0072b2") +
   scale_x_continuous(
     limits = c(0.037, 0.143),
-    name = "unemployment rate", labels = scales::percent,
+    name = "unemployment rate", labels = scales::percent_format(accuracy = 1),
     expand = c(0, 0)
   ) +
   scale_y_continuous(
     limits = c(-0.315, .315), expand = c(0, 0),
     breaks = c(-.3, -.15, 0, .15, .3),
-    name = "12-month change in house prices", labels = scales::percent
+    name = "12-month change in house prices", labels = scales::percent_format(accuracy = 1)
   ) + 
   coord_cartesian(clip = "off") +
   theme_dviz_grid() +
-Original file line number
+Diff line change
@@ Expand Up @@
       )
-    plot_grid(p1, p2)
+    plot_grid(p1, p2, labels = "auto")
     ```
     (ref:mtcars-2d-size) Power versus displacement for 32 cars, with fuel efficiency represented by dot size. Data source: *Motor Trend,* 1974.
@@ Expand Down @@