Adapted from R4DS Chapter 7: Exploratory Data Analysis
Prerequisite: use the preload function to get the tidyverse and descr packages, then load them
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(descr) # Describe attributes of objects/variables
## Warning: package 'descr' was built under R version 4.0.3
Explore the diamonds dataset
# Open the help page for the diamonds dataset.
help("diamonds")
## starting httpd help server ... done
Examine the variables using the class function
# A bare `cut` is not looked up inside diamonds: it resolves to the base R
# function cut(), so class(cut) reports "function". Qualify the column with
# the dataset to inspect the variable itself.
class(diamonds$cut)
## [1] "ordered" "factor"
# One class vector per column, returned as a named list.
lapply(diamonds, class)
## $carat
## [1] "numeric"
##
## $cut
## [1] "ordered" "factor"
##
## $color
## [1] "ordered" "factor"
##
## $clarity
## [1] "ordered" "factor"
##
## $depth
## [1] "numeric"
##
## $table
## [1] "numeric"
##
## $price
## [1] "integer"
##
## $x
## [1] "numeric"
##
## $y
## [1] "numeric"
##
## $z
## [1] "numeric"
Frequency table, alternative method
# Frequency table of cut, evaluated inside diamonds so the column is found.
# plot = TRUE requests freq()'s accompanying plot; TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
with(diamonds, freq(cut, plot = TRUE))
## cut
## Frequency Percent Cum Percent
## Fair 1610 2.985 2.985
## Good 4906 9.095 12.080
## Very Good 12082 22.399 34.479
## Premium 13791 25.567 60.046
## Ideal 21551 39.954 100.000
## Total 53940 100.000
Create a tibble object from the diamonds dataset
# Convert to a tibble and inspect the resulting class vector.
diam_tb <- as_tibble(diamonds)
# How is this different from a data frame?
class(diam_tb)
## [1] "tbl_df" "tbl" "data.frame"
Plot the bar chart using ggplot2
# Bar chart of counts per cut level; the x aesthetic is set once on the plot
# and inherited by the bar layer.
ggplot(diamonds, aes(x = cut)) +
  geom_bar()
Better plot with percentage on y axis
# Bar chart of cut with each bar shown as a share of all observations.
# after_stat(count) refers to the per-bar count computed by geom_bar();
# dividing by the total turns it into a proportion. after_stat() replaces the
# deprecated ..count.. notation.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar(mapping = aes(y = after_stat(count / sum(count)))) +
  theme_bw() +
  # Format the proportions on the y axis as percentages.
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent")
Exercise 1
Save the chart into PNG, PDF and SVG formats
Plot another variable: carat
# Re-open the dataset help page to look up the carat variable.
help("diamonds")
# Histogram of carat using 0.5-wide bins.
ggplot(diam_tb, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)
Exercise 2
Can you improve the visualization of this chart?
What is the difference between a bar chart and a histogram?
# Finer-grained carat histogram (0.03-wide bins) with a custom fill colour,
# descriptive title and axis labels, and the classic theme.
ggplot(diam_tb, aes(x = carat)) +
  geom_histogram(binwidth = 0.03, fill = "#FF6666") +
  labs(
    title = "Quantity of diamonds according to weight",
    x = "Diamond Weight",
    y = "Count"
  ) +
  theme_classic()
Subsetting the dataset
# Keep only the diamonds lighter than 3 carats.
smaller <- dplyr::filter(diamonds, carat < 3)
Histogram of smaller dataset
# Histogram of carat for the subset, using 0.1-wide bins.
ggplot(smaller) +
  geom_histogram(aes(x = carat), binwidth = 0.1)
Frequency polygon
# Frequency polygons of carat, one coloured line per cut level.
ggplot(smaller, aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1) +
  theme_bw()
Exercise 3
Can you change colors?
Hint: use the RColorBrewer package
library(RColorBrewer)
Continuous variables
# Scatter plot of the faithful dataset: eruptions on x, waiting on y, drawn
# with purple points, a title and axis labels, and the classic theme.
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point(color = "purple") +
  labs(
    title = "Eruptions by time of waiting",
    x = "Eruptions",
    y = "Waiting"
  ) +
  theme_classic()
Exercise 4
What method will you use to analyze:
Dependent variable: continuous, Independent variable: discrete -> ANOVA
Dependent variable: discrete, Independent variable: continuous -> LOGISTIC REGRESSION
Dependent variable: continuous, Independent variable: continuous -> LINEAR REGRESSION