R Programming (EDA 2)

Adapted from R4DS Chapter 7: Exploratory Data Analysis

Prerequisite: use the preload function to get the tidyverse and descr packages, then load them

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(descr) # Describe attributes of objects/variables
## Warning: package 'descr' was built under R version 4.0.3

Explore the diamonds dataset

# Open the documentation page for the diamonds dataset.
help("diamonds")
## starting httpd help server ... done

Examine the variables using the class function

# A bare `cut` resolves to the base R function cut(), not to the dataset
# column, so the column is reached through the data frame instead.
class(diamonds$cut)
## [1] "ordered" "factor" 
# Class of every column. lapply() always returns a named list (one entry per
# column), which is needed here because ordered factors carry two classes.
lapply(diamonds, class)
## $carat
## [1] "numeric"
## 
## $cut
## [1] "ordered" "factor" 
## 
## $color
## [1] "ordered" "factor" 
## 
## $clarity
## [1] "ordered" "factor" 
## 
## $depth
## [1] "numeric"
## 
## $table
## [1] "numeric"
## 
## $price
## [1] "integer"
## 
## $x
## [1] "numeric"
## 
## $y
## [1] "numeric"
## 
## $z
## [1] "numeric"

Frequency table, alternative method

# Frequency table of `cut` with an accompanying bar plot. with() keeps the
# variable label in the printed table as "cut". TRUE is spelled out because
# the shorthand `T` is an ordinary variable that can be reassigned.
with(diamonds, freq(cut, plot = TRUE))

## cut 
##           Frequency Percent Cum Percent
## Fair           1610   2.985       2.985
## Good           4906   9.095      12.080
## Very Good     12082  22.399      34.479
## Premium       13791  25.567      60.046
## Ideal         21551  39.954     100.000
## Total         53940 100.000

Create a tibble object from the diamonds dataset

# Convert the dataset to a tibble and inspect the classes it carries.
diam_tb <- as_tibble(diamonds)
class(diam_tb) # How is this different from a data frame?
## [1] "tbl_df"     "tbl"        "data.frame"

Plot the bar chart using ggplot2

# Bar chart of diamond counts per cut category.
ggplot(diamonds, aes(x = cut)) +
  geom_bar()

Better plot with percentage on y axis

# Bar chart of `cut` with each bar expressed as a share of all diamonds.
# after_stat() replaces the deprecated ..count.. notation: it refers to the
# `count` variable computed by the bar stat, so count / sum(count) is the
# proportion of rows falling in each category.
ggplot(data = diamonds, aes(cut)) +
  geom_bar(mapping = aes(y = after_stat(count / sum(count)))) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent")

Exercise 1

Save the chart into PNG, PDF and SVG formats

What are the differences among all these formats?

# Build the percentage bar chart once so that all three exports are
# guaranteed to contain the same figure.
pct_barplot <- ggplot(data = diamonds, aes(cut)) +
  geom_bar(mapping = aes(y = after_stat(count / sum(count)))) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent")

# Folder that receives the exported files. The current working directory is
# used instead of a setwd() call to a machine-specific path, so the chunk runs
# on any computer; point out_dir elsewhere to save the files somewhere else.
out_dir <- "."

# Each export opens a graphics device, draws the plot with an explicit print()
# (auto-printing does not happen inside functions, loops or source()), and
# closes the device so the file is flushed to disk.
pdf(file.path(out_dir, "barplot.pdf"))
print(pct_barplot)
dev.off()
## png 
##   2
png(file.path(out_dir, "barplot.png"))
print(pct_barplot)
dev.off()
## png 
##   2
svg(file.path(out_dir, "barplot.svg"))
print(pct_barplot)
dev.off()
## png 
##   2

Plot another variable carat

# Look up the variable descriptions again, then plot the distribution of
# carat (diamond weight) in bins that are 0.5 carat wide.
help("diamonds")
ggplot(diam_tb, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

Exercise 2

Can you improve the visualization of this chart?

What is the difference between a bar chart and a histogram?

# Same carat histogram with much narrower bins (0.03 carat), a fill colour,
# descriptive title and axis labels, and a minimal classic theme.
ggplot(diam_tb, aes(x = carat)) +
  geom_histogram(binwidth = 0.03, fill = "#FF6666") +
  labs(
    title = "Quantity of diamonds according to weight",
    x = "Diamond Weight",
    y = "Count"
  ) +
  theme_classic()

Subsetting the dataset

# Keep only the diamonds lighter than 3 carats.
smaller <- filter(diamonds, carat < 3)

Histogram of smaller dataset

# Carat histogram for the subset, using 0.1-carat bins.
ggplot(smaller) +
  geom_histogram(aes(x = carat), binwidth = 0.1)

Frequency polygon

# Frequency polygons of carat, one coloured line per cut category, so the
# distributions can be compared on the same axes.
ggplot(smaller, aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1) +
  theme_bw()

Exercise 3

Can you change colors?

Hint: use the RColorBrewer package

library(RColorBrewer)

Continuous variables

# Scatter plot of two continuous variables from the faithful dataset:
# eruptions on the x axis against waiting on the y axis.
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point(color = "purple") +
  labs(
    title = "Eruptions by time of waiting",
    x = "Eruptions",
    y = "Waiting"
  ) +
  theme_classic()

Exercise 4

Which method would you use to analyze each of the following cases?

Dependent variable: continuous, Independent variable: discrete. Answer: ANOVA

Dependent variable: discrete, Independent variable: continuous. Answer: LOGISTIC REGRESSION

Dependent variable: continuous, Independent variable: continuous. Answer: LINEAR REGRESSION