How to describe a statistical population using R - Part 2: Distribution
Jesper Martinsson
From Oceans to Dashboards: Marine Ecologist | Data Wrangler | BI Leader
Besides location and variability, you can also use the distribution to describe your data.
Frequencies and proportions, along with graphical representations like bar plots and histograms, provide a visual overview of your entire dataset. This is helpful in understanding the distribution of the observed values.
Throughout this article we use an example in which the heights of 100 randomly selected oak trees in an area have been measured:
Frequencies
The array of oak tree heights above really does not give any feeling of how the sample is distributed.
One way of getting a more visual understanding of the distribution of the values is to make a frequency table.
You can create a frequency table in two ways; tallies and counts.
Tallies
A tally represents an observation of a specific height:
The use of tallies to create a frequency table provides an overview of the data, offering insights into its distribution. However, this approach is time-consuming and thus not particularly practical. Interestingly, it resembles a bar plot, which we will create shortly. Before that, we will proceed to illustrate the frequency table using numerical values instead of tallies.
Counts
The number of each height is presented:
If you go on and sum these values, you'll find that they add up to 100. That means all the observations are accounted for. Now we can go on to make a plot describing the sample:
Do it in R
install.packages("ggplot2")
library(ggplot2)
# Frequencies
# Heights (in metres) of the 100 sampled oak trees.
y <- c(
  15, 16, 17, 16, 19, 19, 17, 17, 17, 10,
  17, 17, 12, 17, 12, 12, 17, 17, 17, 13,
  17, 17, 13, 17, 14, 14, 17, 17, 17, 15,
  17, 17, 15, 17, 15, 16, 17, 17, 17, 16,
  17, 17, 16, 17, 16, 16, 17, 17, 17, 18,
  17, 17, 18, 17, 18, 18, 17, 17, 17, 19,
  17, 17, 10, 18, 10, 10, 18, 18, 18, 11,
  18, 18, 11, 18, 12, 12, 18, 18, 18, 12,
  18, 18, 13, 18, 14, 14, 18, 18, 18, 15,
  18, 18, 16, 18, 17, 18, 18, 18, 18, 10
)

# Number of trees recorded at each distinct height.
my_counts <- table(y)

# One row per height: column `y` holds the height (as a factor) and
# column `Freq` holds the number of trees with that height.
count_df <- as.data.frame(my_counts)

# Plot the distribution using a bar chart.
# Reads `count_df` (columns `y` and `Freq`) built by the statements above.
font_family <- "sans"

p <- ggplot(data = count_df, aes(x = y, y = Freq)) +
  # The counts are already computed, so draw them as-is.
  geom_col(width = 0.5, fill = "coral") +
  # Start the axis at zero and leave a little headroom above the tallest bar.
  scale_y_continuous(
    name = "Frequency",
    limits = c(0, max(count_df$Freq) + 0.5),
    expand = c(0, 0)
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 12, family = font_family),
    axis.text.y = element_text(size = 12),
    axis.title.y = element_text(
      margin = margin(0, 20, 0, 0),
      size = 16,
      family = font_family,
      face = "bold"
    ),
    plot.title = element_text(size = 16, face = "bold", family = font_family),
    plot.margin = unit(c(1, 5, 1, 1), "lines")
  )
p
Proportions
The data can also be represented using proportions. When you divide the frequency for each height by the total number of units in your sample or population, you obtain the proportion corresponding to that height. In other words, this proportion is the share of the dataset made up of that particular height. It also serves as an estimate of the probability of encountering a tree with that specific height in the given area.
Subsequently, we proceed to calculate the proportion for each height by dividing the count for that height by 100, which is the total number (n) of units in the sample:
Make sure everything is correct by summing all proportions; they should add up to exactly 1.
The plot for proportions looks as follows:
领英推荐
Do it in R
library(ggplot2)
# Proportions
# Heights (in metres) of the 100 sampled oak trees.
y <- c(
  15, 16, 17, 16, 19, 19, 17, 17, 17, 10,
  17, 17, 12, 17, 12, 12, 17, 17, 17, 13,
  17, 17, 13, 17, 14, 14, 17, 17, 17, 15,
  17, 17, 15, 17, 15, 16, 17, 17, 17, 16,
  17, 17, 16, 17, 16, 16, 17, 17, 17, 18,
  17, 17, 18, 17, 18, 18, 17, 17, 17, 19,
  17, 17, 10, 18, 10, 10, 18, 18, 18, 11,
  18, 18, 11, 18, 12, 12, 18, 18, 18, 12,
  18, 18, 13, 18, 14, 14, 18, 18, 18, 15,
  18, 18, 16, 18, 17, 18, 18, 18, 18, 10
)

# Number of trees recorded at each distinct height.
my_counts <- table(y)

# Divide each count by the sample size so the values sum to 1.
proportions <- my_counts / sum(my_counts)

# One row per height: column `y` holds the height (as a factor) and
# column `Freq` holds the proportion of trees with that height.
prop_df <- as.data.frame(proportions)

# Plot the proportions using a bar chart.
# Reads `prop_df` (columns `y` and `Freq`) built by the statements above.
font_family <- "sans"

p <- ggplot(data = prop_df, aes(x = y, y = Freq)) +
  # The proportions are already computed, so draw them as-is.
  geom_col(width = 0.5, fill = "coral") +
  # Start the axis at zero and leave a little headroom above the tallest bar.
  scale_y_continuous(
    name = "Proportion",
    limits = c(0, max(prop_df$Freq) + 0.1),
    expand = c(0, 0)
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 12, family = font_family),
    axis.text.y = element_text(size = 12),
    axis.title.y = element_text(
      margin = margin(0, 20, 0, 0),
      size = 16,
      family = font_family,
      face = "bold"
    ),
    plot.title = element_text(size = 16, face = "bold", family = font_family),
    plot.margin = unit(c(1, 5, 1, 1), "lines")
  )

# Assigning the plot does not draw it; evaluate `p` to render the chart.
p
The class interval
When plotting observation values on the x-axis, it's not always practical to show each individual value. There might be numerous unique values, and in many cases, certain values might have only one observation.
To address this issue, we can group the values into intervals, for instance intervals of three units each. A height of 11 meters would then fall into the interval 10-12. This range is referred to as the 'class interval,' and its midpoint (in this case, 11) is known as the 'class mark.'
We then proceed to aggregate the frequencies of all heights within that interval and present this information beside the corresponding class interval:
Smoother, hey? This table takes up less space than the previous one and presents a smoother distribution of the values. What does this look like in a plot? Wait a minute: a bar plot made on these class intervals is actually called a histogram:
Do it in R
library(ggplot2)
# Class intervals
# Heights (in metres) of the 100 sampled oak trees.
y <- c(
  15, 16, 17, 16, 19, 19, 17, 17, 17, 10,
  17, 17, 12, 17, 12, 12, 17, 17, 17, 13,
  17, 17, 13, 17, 14, 14, 17, 17, 17, 15,
  17, 17, 15, 17, 15, 16, 17, 17, 17, 16,
  17, 17, 16, 17, 16, 16, 17, 17, 17, 18,
  17, 17, 18, 17, 18, 18, 17, 17, 17, 19,
  17, 17, 10, 18, 10, 10, 18, 18, 18, 11,
  18, 18, 11, 18, 12, 12, 18, 18, 18, 12,
  18, 18, 13, 18, 14, 14, 18, 18, 18, 15,
  18, 18, 16, 18, 17, 18, 18, 18, 18, 10
)

# Number of whole-metre heights grouped into one class (10-12, 13-15, ...).
class_width <- 3

# Breaks start at the smallest height and run one full width past the
# largest, so both the minimum and the maximum fall inside an interval.
class_breaks <- seq(min(y), max(y) + class_width, by = class_width)
lower_bounds <- head(class_breaks, -1)

# Label each class by the whole-metre heights it contains, e.g. "10-12".
# This labelling assumes the heights are integers, as they are here.
class_labels <- paste(lower_bounds, lower_bounds + class_width - 1, sep = "-")

# right = FALSE gives intervals of the form [lower, upper), so a height equal
# to a lower bound (including the minimum) is counted in that class.
class_intervals <- as.data.frame(
  table(cut(y, breaks = class_breaks, right = FALSE, labels = class_labels))
)

# The plotting code reads the table under this name, with the class labels in
# `Var1` (a factor, in ascending order) and the counts in `Freq`.
add_last_interval <- class_intervals

# Every observation must be accounted for.
stopifnot(sum(add_last_interval$Freq) == length(y))
# Create plot
# Reads `add_last_interval` (class labels in `Var1`, counts in `Freq`) built
# by the statements above.
font_family <- "sans"

ggplot(data = add_last_interval, aes(x = Var1, y = Freq)) +
  # The counts per class are already computed, so draw them as-is.
  geom_col(width = 0.5, fill = "coral") +
  # Start the axis at zero and leave a little headroom above the tallest bar.
  scale_y_continuous(
    name = "Frequency",
    limits = c(0, max(add_last_interval$Freq) + 1),
    expand = c(0, 0)
  ) +
  scale_x_discrete("Class interval") +
  theme(
    axis.title.x = element_text(
      margin = margin(20, 0, 0, 0),
      size = 16,
      family = font_family,
      face = "bold"
    ),
    axis.text.x = element_text(size = 12, family = font_family),
    axis.text.y = element_text(size = 12),
    axis.title.y = element_text(
      margin = margin(0, 20, 0, 0),
      size = 16,
      family = font_family,
      face = "bold"
    ),
    plot.title = element_text(size = 16, face = "bold", family = font_family),
    plot.margin = unit(c(1, 5, 1, 1), "lines")
  )
Additional methods to create histograms in R
geom_histogram
# Histogram with geom_histogram
# Heights (in metres) of the 100 sampled oak trees.
y <- c(
  15, 16, 17, 16, 19, 19, 17, 17, 17, 10,
  17, 17, 12, 17, 12, 12, 17, 17, 17, 13,
  17, 17, 13, 17, 14, 14, 17, 17, 17, 15,
  17, 17, 15, 17, 15, 16, 17, 17, 17, 16,
  17, 17, 16, 17, 16, 16, 17, 17, 17, 18,
  17, 17, 18, 17, 18, 18, 17, 17, 17, 19,
  17, 17, 10, 18, 10, 10, 18, 18, 18, 11,
  18, 18, 11, 18, 12, 12, 18, 18, 18, 12,
  18, 18, 13, 18, 14, 14, 18, 18, 18, 15,
  18, 18, 16, 18, 17, 18, 18, 18, 18, 10
)

# geom_histogram() bins the raw observations itself, so it needs one row per
# tree rather than a pre-computed frequency table.
hist_df <- data.frame(y = y)

# Draw the histogram from the raw heights in `hist_df$y`.
font_family <- "sans"

ggplot(data = hist_df, aes(x = y)) +
  # Let ggplot2 split the observed range into three bins and count each one.
  geom_histogram(bins = 3, fill = "coral") +
  scale_y_continuous(name = "Count", expand = c(0, 0)) +
  xlab("Height(m)") +
  theme(
    axis.title.x = element_text(
      margin = margin(20, 0, 0, 0),
      size = 16,
      family = font_family,
      face = "bold"
    ),
    axis.text.x = element_text(size = 12, family = font_family),
    axis.text.y = element_text(size = 12),
    axis.title.y = element_text(
      margin = margin(0, 20, 0, 0),
      size = 16,
      family = font_family,
      face = "bold"
    ),
    plot.title = element_text(size = 16, face = "bold", family = font_family),
    plot.margin = unit(c(1, 5, 1, 1), "lines")
  )
The hist() function
# Histogram with hist function
# Heights (in metres) of the 100 sampled oak trees.
y <- c(
  15, 16, 17, 16, 19, 19, 17, 17, 17, 10,
  17, 17, 12, 17, 12, 12, 17, 17, 17, 13,
  17, 17, 13, 17, 14, 14, 17, 17, 17, 15,
  17, 17, 15, 17, 15, 16, 17, 17, 17, 16,
  17, 17, 16, 17, 16, 16, 17, 17, 17, 18,
  17, 17, 18, 17, 18, 18, 17, 17, 17, 19,
  17, 17, 10, 18, 10, 10, 18, 18, 18, 11,
  18, 18, 11, 18, 12, 12, 18, 18, 18, 12,
  18, 18, 13, 18, 14, 14, 18, 18, 18, 15,
  18, 18, 16, 18, 17, 18, 18, 18, 18, 10
)

# Tick positions for the hand-drawn x-axis: every second metre, starting at
# the smallest height.
xrange <- seq(min(y), max(y), 2)

hist(
  y,
  breaks = "Sturges",
  # freq = FALSE plots densities (total bar area of one) instead of counts.
  freq = FALSE,
  col = "coral",
  main = NULL,
  ylim = c(0, 1),
  xlim = c(min(y), max(y)),
  xlab = "Oak tree heights (m)",
  ylab = "Probability",
  bty = "l",
  las = 1,
  # Suppress the default x-axis; a custom one is added below.
  xaxt = "n",
  cex.lab = 1.2
)

# Draw the x-axis along y = 0 with ticks at the positions chosen above.
axis(
  side = 1,
  at = xrange,
  labels = xrange,
  pos = 0,
  las = 1,
  tick = TRUE
)
Cover photo: Histogram using 10 000
Histogram of 10,000 normally distributed values generated with the mean and standard deviation of the example above
Do it in R
# Use rnorm() to simulate heights, then tabulate them for a bar chart.
#
# NOTE: `y` must already hold the sampled oak tree heights from one of the
# examples above, because its mean and standard deviation parameterise the
# simulation. `y` is then overwritten with the simulated values, so re-running
# this snippet without re-creating `y` draws from the simulated data instead.
# The draws are random; call set.seed() first if you need a reproducible plot.
y <- round(rnorm(10000, mean(y), sd(y)), 0)

# Number of simulated trees at each rounded height.
my_counts <- table(y)
count_df <- as.data.frame(my_counts)

# Share of simulated trees at each rounded height; the values sum to 1.
proportions <- my_counts / sum(my_counts)

# One row per height: column `y` holds the height (as a factor) and
# column `Freq` holds its proportion.
prop_df <- as.data.frame(proportions)

# Plot the simulated proportions as a bar chart.
# Reads `prop_df` (columns `y` and `Freq`) built by the statements above.
font_family <- "sans"

p <- ggplot(data = prop_df, aes(x = y, y = Freq)) +
  # The proportions are already computed, so draw them as-is.
  geom_col(width = 0.5, fill = "coral") +
  # The axis runs from zero to exactly the tallest bar.
  scale_y_continuous(
    name = "Proportion",
    limits = c(0, max(prop_df$Freq)),
    expand = c(0, 0)
  ) +
  # Hide all x-axis decoration; only the shape of the distribution matters.
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 12),
    axis.ticks.x = element_blank(),
    axis.title.y = element_text(
      margin = margin(0, 20, 0, 0),
      size = 16,
      family = font_family,
      face = "bold"
    ),
    plot.title = element_text(size = 16, face = "bold", family = font_family),
    plot.margin = unit(c(1, 5, 1, 1), "lines")
  )
p
Key takeaways: