Get started#

Load data#


To start playing around with the functions from these packages we will use the palmerpenguins data set. This simple data set has both continuous and categorical variables that make it perfect for showcasing how different functions work.


# Location of the palmerpenguins data as a CSV file.
# NOTE(review): the URL was an empty string here, which makes url() fail.
# The address below is the raw CSV shipped in the allisonhorst/palmerpenguins
# repository -- confirm it is the intended source.
penguins_url = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/main/inst/extdata/penguins.csv"
# download and parse the CSV into a tibble
dat = read_csv(url(penguins_url))
# keep only rows without missing values
dat = dat %>% drop_na()

## # A tibble: 6 × 8
##   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <chr>   <chr>              <dbl>         <dbl>             <dbl>       <dbl>
## 1 Adelie  Torgersen           39.1          18.7               181        3750
## 2 Adelie  Torgersen           39.5          17.4               186        3800
## 3 Adelie  Torgersen           40.3          18                 195        3250
## 4 Adelie  Torgersen           36.7          19.3               193        3450
## 5 Adelie  Torgersen           39.3          20.6               190        3650
## 6 Adelie  Torgersen           38.9          17.8               181        3625
## # ℹ 2 more variables: sex <chr>, year <dbl>

General plotting with ggpubr#


ggpubr lets you quickly make insightful exploratory plots, which can then be customized further because the package is built on top of ggplot2.

These are useful links for using this package:

Next, we will try to answer different questions using this library and ggplot2.

How many penguins of each species did we observe in total?#

# tally the observations per species, then draw one pie slice per species
dat %>%
  count(species) %>%
  ggpie(x = "n", fill = "species")

How many penguins of each species and sex did we observe across the different islands?#

# count penguins per species, sex and island;
# bars are dodged by sex and each island gets its own panel (facet.by)
ggbarplot(dat %>% count(species, sex, island), x = "species", y = "n", fill = "sex",
          label = TRUE, position = position_dodge(0.7),
          facet.by = "island", palette = "lancet")

What are the distributions of flipper lengths considering penguin species, sex and islands of origin?#

# histogram of flipper length coloured by sex,
# with one panel per species/island combination (facet.by)
gghistogram(dat, x = "flipper_length_mm", fill = "sex",
            facet.by = c("species", "island"))

Alternatively, we can use strip charts:

# flipper length per island, points jittered and dodged by sex,
# one panel per species (facet.by); median and IQR are overlaid in black
ggstripchart(dat, x = "island", y = "flipper_length_mm", color = "sex",
             facet.by = "species", alpha = 0.5,
             position = position_jitterdodge(), add = "median_iqr",
             add.params = list(color = "black", group = "sex", size = 0.2))

Are the differences of body mass between sexes significant if we control for species and island?#

# body mass per island, points jittered and dodged by sex,
# one panel per species (facet.by); median and IQR are overlaid in black
ggstripchart(dat, x = "island", y = "body_mass_g", color = "sex",
             facet.by = "species", alpha = 0.5,
             position = position_jitterdodge(), add = "median_iqr",
             add.params = list(color = "black", group = "sex", size = 0.2)) +
   # Wilcoxon test between sexes, shown as significance symbols
   stat_compare_means(aes(color = sex), label = "p.signif", method = "wilcox.test")

What is the relationship between flipper length, body mass and bill length?#

# flipper length against body mass, with points coloured by bill length
dat %>%
  ggscatter(
    x = "flipper_length_mm",
    y = "body_mass_g",
    color = "bill_length_mm",
    alpha = 0.5
  )

Could we have sampling bias in the relationship between flipper length and body mass?#

# treat the sampling year as a category so each year gets its own colour
# and its own ellipse
dat %>%
  mutate(year = factor(year)) %>%
  ggscatter(
    x = "flipper_length_mm",
    y = "body_mass_g",
    alpha = 0.5,
    color = "year",
    ellipse = TRUE
  )

What is the Spearman correlation coefficient between body mass and flipper length?#

# scatter plot coloured by sampling year, with a regression line,
# its confidence interval (conf.int) and the Spearman correlation coefficient
ggscatter(dat %>% mutate(year = factor(year)),
          x = "flipper_length_mm", y = "body_mass_g", alpha = 0.5, color = "year",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE,
          cor.coeff.args = list(method = "spearman", label.sep = "\n")) +
   theme(aspect.ratio = 1)

Create and save a figure#

# text sizes shared by all panels of the figure:
# fontsize is used as the theme base size, labsize for labels drawn inside the plots
fontsize = 6
labsize = 2

# overview number of observations of every sex across islands and species
# (one panel per island via facet.by)
p1 = ggbarplot(dat %>% count(species, sex, island), x = "species", y = "n", fill = "sex",
               label = TRUE, lab.size = labsize,
               position = position_dodge(0.7), facet.by = "island", palette = "lancet") +
   # fix the upper y limit at 68, leaving the lower limit automatic
   ylim(NA, 68)

# sex-related body mass distributions across islands and species
# (one panel per species via facet.by)
p2 = ggstripchart(dat, x = "island", y = "body_mass_g", color = "sex", facet.by = "species",
                  alpha = 0.5, position = position_jitterdodge(), add = "median_iqr",
                  add.params = list(color = "black", group = "sex", size = 0.2),
                  palette = "lancet") +
            # Wilcoxon test between sexes, shown as significance symbols
            stat_compare_means(aes(color = sex), label = "p.signif", method = "wilcox.test", size = labsize)

# association of flipper length and body mass
# (regression line with confidence interval and Spearman correlation)
p3 = ggscatter(dat %>% mutate(year = factor(year)),
          x = "flipper_length_mm", y = "body_mass_g", alpha = 0.5, color = "year",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE,
          cor.coeff.args = list(method = "spearman", label.sep = "\n", size = labsize)) +
   theme(aspect.ratio = 1)

# stack the two faceted panels in a single column with one shared legend
p1p2 = ggarrange(
  p1 + theme_pubr(base_size = fontsize),
  p2 + theme_pubr(base_size = fontsize),
  ncol = 1,
  common.legend = TRUE
)
# place the scatter plot next to the stacked panels and label the parts automatically
fig = ggarrange(
  p1p2,
  p3 + theme_pubr(base_size = fontsize),
  widths = c(2, 1),
  heights = c(2, 1),
  labels = "AUTO"
)

# save
# make sure the output directory exists before writing into it
dir.create("images", showWarnings = FALSE, recursive = TRUE)
# the argument is spelled out as `units` rather than relying on partial matching of `unit`
ggsave("images/myfig.png", fig, width = 15, height = 10, units = "cm")

Heatmaps with ComplexHeatmap#

Apart from ggpubr, one of the most common packages for visualizing multiple types of data together is ComplexHeatmap, which combines hierarchical clustering of rows and columns with continuous and categorical data.

# we are only interested in numeric columns
cols_oi = c("bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g")
rownames(dat) = seq_len(nrow(dat))

# we need to add as.data.frame() because "dat" is a tibble,
# and tibbles and data frames differ in the way they handle underlying data types
# we can customize the color for each species
colors_species = c("Adelie" = "red", "Chinstrap" = "yellow", "Gentoo" = "grey")
colors_annot = list(species = colors_species)
# which = "row": the annotation is attached to the rows through right_annotation below
annotation_row = HeatmapAnnotation(df = dat[, c("island", "species")] %>% as.data.frame(),
                                   col = colors_annot,
                                   which = "row")

# numeric matrix of the measurements, centered and scaled per column
mat = dat[, cols_oi] %>% as.matrix()
mat = scale(mat)
Heatmap(mat,
        show_row_names = FALSE,
        right_annotation = annotation_row)

