7 The `ggplot2` package

ggplot2 is one of the visualization tools available in the R system; the others are the base R plotting functions and the lattice package. ggplot2 is the most evolved and complete of these plotting packages. The components of a plot include:
- the data being plotted: a data frame or tibble (tidy data frame)
- the geometric objects (circles, lines, etc.) that appear on the plot
- a set of mappings from variables in the data to the aesthetics (appearance) of the geometric objects: which columns are x and y, the color, the size, etc.
- a statistical transformation used to calculate the data values used in the plot
- a position adjustment for locating each geometric object on the plot
- a scale (e.g., range of values) for each aesthetic mapping used: scale_color_manual(), scale_x_continuous(), etc.
- a coordinate system used to organize the geometric objects
- the facets, or groups of data shown in different plots: facet_wrap(), facet_grid()
- layers, where each layer has a single geometric object, statistical transformation, and position adjustment; you can think of each plot as a set of layers of images
- a theme: theme_bw(), theme_light(), etc.

The typical call to ggplot():

ggplot(data=<data>, aes(x=<x>, y=<y>, color=<z>, size=<w>))+
geom_<geometry>()+
scale_<scales>()+
facet_<facets>()+
<theme>

There are hundreds of geometries and ways to plot the data. In summary, to create a plot we need to:
- call the ggplot() function, which creates a blank canvas
- specify aesthetic mappings between variables and visual aspects
- add new layers of geometric objects such as geom_point(), geom_bar(), etc.

The examples below use datasets available in the R system: diamonds and mpg (both from ggplot2) and mtcars (from base R)

7.1 Diamonds dataset

7.1.1 Exploratory data analysis

Visualising distributions

library(ggplot2)
library(dplyr) # or library(tidyverse)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

# Load the diamonds data and draw one bar per level of `cut`.
data("diamonds")
ggplot(diamonds) +
  geom_bar(aes(x = cut))

# Each bar's height is the number of rows for that cut; tabulate the same
# counts to read the exact values.
dplyr::count(diamonds, cut)

# A tibble: 5 × 2
  cut           n
  <ord>     <int>
1 Fair       1610
2 Good       4906
3 Very Good 12082
4 Premium   13791
5 Ideal     21551

# A continuous variable is summarised with a histogram (bins 0.5 wide).
ggplot(diamonds) +
  geom_histogram(aes(x = carat), binwidth = 0.5)

# The same binning as a table: cut_width() assigns each carat value to an
# interval of width 0.5, and count() tallies the rows per interval.
count(diamonds, cut_width(carat, 0.5))

# A tibble: 11 × 2
   `cut_width(carat, 0.5)`     n
   <fct>                   <int>
 1 [-0.25,0.25]              785
 2 (0.25,0.75]             29498
 3 (0.75,1.25]             15977
 4 (1.25,1.75]              5313
 5 (1.75,2.25]              2002
 6 (2.25,2.75]               322
 7 (2.75,3.25]                32
 8 (3.25,3.75]                 5
 9 (3.75,4.25]                 4
10 (4.25,4.75]                 1
11 (4.75,5.25]                 1

We may subset the data to plot only a smaller part of it

# Keep only the diamonds below 3 carats.
smaller <- filter(diamonds, carat < 3)

# The binwidth argument sets the width of the histogram intervals.
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

# One frequency polygon per cut level, overlaid and told apart by colour.
ggplot(smaller, aes(x = carat, color = cut)) +
  geom_freqpoly(binwidth = 0.1)

Identifying specific points, outliers, etc. by changing the bin width or zooming in on an axis

# A much narrower bin width (0.01) for the same carat histogram.
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.01)

# Distribution of the `y` column over its full range.
ggplot(diamonds) +
  geom_histogram(aes(x = y), binwidth = 0.5)

# Same histogram, zoomed so the count axis only spans 0 to 50.
ggplot(diamonds) +
  geom_histogram(aes(x = y), binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

# Pull out the rows whose `y` is below 3 or above 20, keep the price and
# dimension columns, and sort them by `y`.
unusual <- arrange(
  select(filter(diamonds, y < 3 | y > 20), price, x, y, z),
  y
)
unusual

# A tibble: 9 × 4
  price     x     y     z
  <int> <dbl> <dbl> <dbl>
1  5139  0      0    0   
2  6381  0      0    0   
3 12800  0      0    0   
4 15686  0      0    0   
5 18034  0      0    0   
6  2130  0      0    0   
7  2130  0      0    0   
8  2075  5.15  31.8  5.12
9 12210  8.09  58.9  8.06

Replacing the extreme values with NA and plotting the new data

# Blank out (set to NA) every `y` below 3 or above 20; the rows are kept.
diamonds2 <- mutate(diamonds, y = ifelse(y < 3 | y > 20, NA, y))

# Scatter plot of x against the cleaned y.
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point()

Warning: Removed 9 rows containing missing values or values outside the scale range
(`geom_point()`).

7.1.2 Boxplots

# One box of price per cut level.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot()

7.1.3 Two categorical variables

Different visualisations

# geom_count() places one point at each cut/color combination.
ggplot(diamonds) +
  geom_count(aes(x = cut, y = color))

# The same combinations as a table of counts.
count(diamonds, color, cut)

# A tibble: 35 × 3
   color cut           n
   <ord> <ord>     <int>
 1 D     Fair        163
 2 D     Good        662
 3 D     Very Good  1513
 4 D     Premium    1603
 5 D     Ideal      2834
 6 E     Fair        224
 7 E     Good        933
 8 E     Very Good  2400
 9 E     Premium    2337
10 E     Ideal      3903
# ℹ 25 more rows

# Alternative geometry: tabulate color/cut first, then fill one tile per
# combination according to its count `n`.
ggplot(count(diamonds, color, cut), aes(x = color, y = cut)) +
  geom_tile(aes(fill = n))

7.2 Plotting relationships diamonds

Simple plot of carats vs price

# diamonds comes from ggplot2; see ?diamonds for the column descriptions.
data("diamonds")

# Store the base plot (data + mappings) and add the point layer when showing it.
p <- ggplot(diamonds, aes(x = carat, y = price))
p + geom_point()

# The same scatter with nearly transparent points (alpha adds transparency).
ggplot(diamonds) +
  geom_point(aes(x = carat, y = price), alpha = 0.01)

Plot the smaller subset with different geometries

# Rectangular 2-D bins of carat against price for the < 3 carat subset.
ggplot(smaller) +
  geom_bin2d(aes(x = carat, y = price))

`stat_bin2d()` using `bins = 30`. Pick better value `binwidth`.

# Hexagonal 2-D bins of carat against price.
# geom_hex() depends on the optional hexbin package
# (install.packages("hexbin")), so the example is skipped with a message
# when the package is not available, instead of failing at draw time.
if (requireNamespace("hexbin", quietly = TRUE)) {
  ggplot(data = smaller) +
    geom_hex(mapping = aes(x = carat, y = price))
} else {
  message("hexbin is not installed; skipping hexagonal binning example.")
}

Warning: Computation failed in `stat_binhex()`.
Caused by error in `compute_group()`:
! The package "hexbin" is required for `stat_bin_hex()`.

Keeping only the diamonds below 2 carats, coloring the points by the variable cut, and adding some transparency to the points (alpha)

# Keep carat < 2 and colour the points by cut.
data("diamonds")
p <- ggplot(
  filter(diamonds, carat < 2),
  aes(x = carat, y = price, colour = cut)
)
p + geom_point(alpha = 0.5)

Adding some smooth splines

# Semi-transparent points plus a smoothed trend line on the stored plot `p`.
p +
  geom_point(alpha = 0.5) +
  geom_smooth()

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Coloring points in a different way to understand the possible relationship

# Same subset (carat < 2), this time coloured by clarity.
p <- ggplot(
  filter(diamonds, carat < 2),
  aes(x = carat, y = price, colour = clarity)
)
p +
  geom_point(alpha = 0.5) +
  geom_smooth()

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Since several variables may affect the price, we can draw one panel per level of cut using facet_wrap(~cut)

# One rainbow colour per distinct clarity value.
colors <- rainbow(length(unique(diamonds$clarity)))

# Price as a function of carat, one panel per cut.
# carat goes on x and price on y, matching the other carat/price plots in
# this section; with the axes the other way round, geom_smooth() would
# model carat from price.
p <- ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(aes(color = clarity), alpha = 0.5, size = 1) +
  geom_smooth(color = "black") +
  scale_colour_manual(values = colors, name = "Clarity") +
  facet_wrap(~cut)
p

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Or we may change the size of the point

# Point size encodes cut, colour encodes clarity.
# carat goes on x and price on y so that this plot is directly comparable
# with the other carat/price plots in this section.
# `colors` is the clarity palette defined earlier in the chapter.
p <- ggplot(diamonds, aes(x = carat, y = price, size = cut)) +
  geom_point(aes(color = clarity), alpha = 0.5) +
  scale_colour_manual(values = colors, name = "Clarity")
p

Sampling the data to unclutter the plot

# Draw 500 random rows so individual points stay visible; size encodes cut
# and colour encodes clarity. `colors` is the palette defined earlier.
p <- ggplot(
  diamonds[sample(nrow(diamonds), size = 500), ],
  aes(x = carat, y = price, size = cut)
) +
  geom_point(aes(colour = clarity), alpha = 0.5) +
  scale_colour_manual(values = colors, name = "Clarity")
p

7.2.1 Using Themes

theme_grey(), theme_classic(), theme_dark(), theme_minimal()

# Custom theme: theme_bw() with bold 18-pt Times text, thicker axis ticks
# and panel border, and no grid lines.
# Line and border thickness is set with `linewidth`; the `size` argument of
# element_line() and element_rect() is deprecated as of ggplot2 3.4.0.
my_theme <- theme_bw() +
  theme(
    text = element_text(size = 18, family = "Times", face = "bold"),
    axis.ticks = element_line(linewidth = 1),
    legend.text = element_text(size = 14, family = "Times"),
    panel.border = element_rect(linewidth = 2),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.

Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.

# Show the current plot `p` with the custom theme applied.
p + my_theme

7.3 Interactivity with plotly

# plotly is optional: without it, print a note instead of failing.
if (!requireNamespace("plotly", quietly = TRUE)) {
  message("plotly is not installed; skipping interactive example.")
} else {
  library(plotly)
  # 100 random diamonds, coloured by clarity, styled with my_theme.
  p <- ggplot(
    diamonds[sample(nrow(diamonds), size = 100), ],
    aes(x = carat, y = price)
  ) +
    geom_point(aes(colour = clarity), alpha = 0.5, size = 2) +
    my_theme
  # Convert the static ggplot into an interactive plotly widget.
  ggplotly(p, dynamicTicks = TRUE)
}

plotly is not installed; skipping interactive example.

Chapter 28 from R for Data Science

library(ggplot2)
data("mtcars") # base R dataset; see ?mtcars

# Base R histogram of the mpg column of mtcars.
hist(mtcars$mpg)

# Step 1: data only -- an empty canvas.
ggplot(data = mpg)

# Step 2: map the variables of interest -- axes appear, still no points.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy))

# Step 3: add a geometry layer to draw the data.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point()

7.3.1 Labels, subtitles, captions

# Scatter coloured by class, a smoother without its confidence band, and a
# plot title set through labs().
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_smooth(se = FALSE) +
  labs(title = "Fuel efficiency generally decreases with engine size")

`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Same plot, now with a subtitle and a caption as well as the title.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Fuel efficiency generally decreases with engine size",
    subtitle = "Two seaters (sports cars) are an exception because of their light weight",
    caption = "Data from fueleconomy.gov"
  )

`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# labs() also renames the axes and the legend of the colour aesthetic.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(se = FALSE) +
  labs(
    x = "Engine displacement (L)",
    y = "Highway fuel economy (mpg)",
    color = "Car type"
  )

`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Ten random points; only the axis labels matter in this example.
df <- tibble(x = runif(10), y = runif(10))

# quote() passes unevaluated expressions to labs(), so the labels are drawn
# as mathematical notation instead of plain text.
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  labs(
    x = quote(sum(x[i] ^ 2, i == 1, n)),
    y = quote(alpha + beta + frac(delta, theta))
  )

7.3.2 Annotations

We may label individual observations or groups of observations with geom_text() or geom_label(), and keep the labels from overlapping with the ggrepel package

# For each vehicle class, keep the single row with the highest hwy
# (row_number() ranks without ties, so exactly one row per class is kept).
# ungroup() drops the grouping afterwards so later operations on
# best_in_class are not silently applied per class.
best_in_class <- mpg %>%
  group_by(class) %>%
  filter(row_number(desc(hwy)) == 1) %>%
  ungroup()

# Plain text labels: the model name of each row in best_in_class.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_text(data = best_in_class, aes(label = model))

# Boxed, semi-transparent labels nudged upwards by 2 units on the y axis.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_label(
    data = best_in_class,
    aes(label = model),
    nudge_y = 2,
    alpha = 0.5
  )

# Mark the labelled points with a larger hollow circle and let ggrepel
# position the labels.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_point(data = best_in_class, size = 3, shape = 1) +
  ggrepel::geom_label_repel(data = best_in_class, aes(label = model))

7.4 Further Information

The book ggplot2
Video Introduction to ggplot in R Youtube video
Ggplot2 gallery
R for Data Science (2e): Data Visualization
R for Data Science (2e): Communication
An introduction to ggplot2
Top 50 ggplot visualizations