Mean, median, mode and variance¶

Statistical Laboratory

Alessandro Ortis - University of Catania

# Build a numeric vector of sample observations.
x <- c(12, 70, 21, 4.2, 1.8, 12, 5.4, -2.1, -8, -5)

# Arithmetic mean of the sample
res <- mean(x)
print(res)
# Median of the same sample
res <- median(x)
print(res)

[1] 11.13
[1] 4.8

# Handling NA (missing) values
x <- c(12, 70, 21, 4.2, 1.8, 12, 5.4, -2.1, -8, -5, NA, NA, NA)
# Default call
res <- mean(x)
print(res)
# Same call with na.rm = TRUE
res <- mean(x, na.rm = TRUE)
print(res)

[1] NA
[1] 11.13

Question:¶

How has the mean value been computed in the second case?

1) NA is considered as zero

2) NA is ignored

# A small sample with one extreme value (-50)
x <- c(-50, 2, 3, 2, 1, 4, 2, 3, 3)
# Mean of the sample
res <- mean(x)
print(res)
# Median of the sample
res <- median(x)
print(res)

[1] -3.333333
[1] 2

Question:¶

Why are the mean and the median so different in each of the previous two examples?

Trimmed mean¶

When the trim parameter is supplied, the values in the vector are sorted and the requested fraction of observations is dropped from each end before the mean is computed. With trim = 0.25, the lowest 25% and the highest 25% of the values are excluded from the calculation.

# With trim = 0.25 the trimmed mean is also called the interquartile mean
res <- mean(x, trim = 0.25)
print(res)
# Median of the same vector, for comparison
res <- median(x)
print(res)

[1] 2.4
[1] 2

Question:¶

Why are the trimmed mean and the median more similar now?

# Display the values of x in ascending order (the default of sort)
sort(x)

Variance, stddev, quartiles and percentiles¶

# Standard deviation and variance of x
std_d <- sd(x)
print(std_d)
v <- var(x)
print(v)

# Cross-check: print the square root of the variance and the squared
# standard deviation next to the values above
print(sqrt(v))
print(std_d^2)

[1] 17.52142
[1] 307
[1] 17.52142
[1] 307

# Open the help page for sort
?sort

# New sample used for the quantile examples
x <- c(12, 31, 20, 12, 10, 50, 65, 23, 33, 10)
print(x)
# Default call: 0%, 25%, 50%, 75% and 100% quantiles
quantile(x)

 [1] 12 31 20 12 10 50 65 23 33 10

# Store the quantiles; the result is a named vector
q_res <- quantile(x)
print(q_res)
# Look up a single quantile by its label
q_res["25%"]

  0%  25%  50%  75% 100% 
10.0 12.0 21.5 32.5 65.0 

# Display the sorted sample
sort(x) # ascending order

# quantile() also computes arbitrary percentiles:
# pass the desired probability as the second argument.
quantile(x, probs = 0.25)
quantile(x, probs = 0.50)
quantile(x, probs = 0.67)

# The 67th percentile, stored as a one-element named vector
q_67 <- quantile(x, probs = 0.67)
# Its label
names(q_67)

Boxplots¶

Boxplots are useful data visualization tools to observe how the data are distributed.

# Column names of the built-in mtcars data set
print(colnames(mtcars))
# First rows of the table
head(mtcars)

 [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
[11] "carb"

# Keep only the mpg and cyl columns
sub_data <- mtcars[, c("mpg", "cyl")]
print(head(sub_data))

                   mpg cyl
Mazda RX4         21.0   6
Mazda RX4 Wag     21.0   6
Datsun 710        22.8   4
Hornet 4 Drive    21.4   6
Hornet Sportabout 18.7   8
Valiant           18.1   6

# '~' == ALT+126 (ALT+n for Mac users)
# How does mpg vary depending on vs?
boxplot(
    mpg ~ vs,
    data = mtcars,
    main = "mpg distributions",
    xlab = "Type of engine",
    ylab = "Miles per gallon"
)

# Open the help page for boxplot
?boxplot

# One box of mpg values per number of cylinders
boxplot(
    mpg ~ cyl,
    data = mtcars,
    main = "mpg distributions",
    xlab = "Num. of cylinders",
    ylab = "Miles per gallon"
)

More on mean and median¶

# Fix the seed so the random draw is reproducible
set.seed(123)
# Draw 1000 random Gaussian values with mean = 10 and sigma = 5
data <- rnorm(n = 1000, mean = 10, sd = 5)

# Compute the mean and the median
mean(data)
median(data)

As we expected, mean and median have similar values.

# Plot a density-scaled histogram of X and overlay the normal density whose
# mean and standard deviation are estimated from X.
#
# X: numeric vector of observations.
# Called for its side effect (a plot on the current graphics device).
density_plot <- function(X) {
    # Evaluation grid and values of the fitted normal curve
    x2 <- seq(min(X), max(X), length.out = 40)
    fun <- dnorm(x2, mean = mean(X), sd = sd(X))
    # Bin the data without drawing, only to learn how tall the bars will be
    bars <- hist(X, plot = FALSE)
    # The y-axis has to fit both the bars and the curve: sizing it on the
    # curve alone clips the bars when the data are far from normal
    hist(X, freq = FALSE,
         col = "white",
         ylim = c(0, max(fun, bars$density)),
         main = "Histogram")
    lines(x2, fun, col = 2, lwd = 2)
}

# Show the data distribution:
# histogram with the fitted normal curve
density_plot(data)
# plain frequency histogram
hist(data)
# boxplot of the same values
boxplot(data, main = "Boxplot of data")

Now add a few noise values, i.e. values belonging to another distribution.

# 50 values drawn from a different Gaussian (mean = 1500, sigma = 5)
noise <- rnorm(n = 50, mean = 1500, sd = 5)
hist(noise)

# Append the noise to the original sample
noisy_data <- c(data, noise)

mean(noisy_data)
median(noisy_data)

Now the median and the mean values are very different, although we perturbed only about 5% of the population (50 new values out of 1050).

# Same three views as before, now on the perturbed sample
hist(noisy_data)
boxplot(noisy_data, main = "Boxplot of noisy data")
density_plot(noisy_data)

This simple experiment shows how even a small number of noisy values can strongly affect the mean; in such cases the median can be more reliable.

Exercises¶

Create a set of random data, then add some noise considering both tails and:

1) Clean the data, considering only the items within the interval [Q1,Q3]

2) Clean the data, considering only the items within the interval [Q1-1.5ID, Q3+1.5ID], where ID = Q3-Q1.

# Solution (proposed by class)

set.seed(123)
data <- rnorm(n = 1000, mean = 10, sd = 5)
# Noise on both tails: one cluster around -1000 and the same cluster
# shifted by 2000 (i.e. around +1000)
noise_1 <- rnorm(n = 50, mean = -1000, sd = 5)
noise_2 <- noise_1 + 2000

d <- c(data, noise_1, noise_2)

# Pick the quartiles by label rather than by position;
# [[ ]] also drops the "25%" / "75%" names
q <- quantile(d)
q1 <- q[["25%"]]  #  same value as quantile(d, 0.25)
q3 <- q[["75%"]]  #  same value as quantile(d, 0.75)

# 1.5 times the interquartile distance ID = Q3 - Q1
k <- (q3 - q1) * 1.5

# Both intervals in the exercise are closed, so the bounds are included
c1 <- d[d >= q1 & d <= q3] # 1)
c2 <- d[d >= (q1 - k) & d <= (q3 + k)] # 2)

hist(d)
hist(c1)
hist(c2)

# Create a set of random data
my_data <- rnorm(1000, mean = 10, sd = 1)
# Add some noise considering both tails
noise <- rnorm(50, mean = 23, sd = 100)
my_data <- c(my_data, noise)

boxplot(my_data)

# 1) Clean the data, considering only the items within the interval [Q1,Q3]
q1 <- quantile(my_data, 0.25)
q1
q3 <- quantile(my_data, 0.75)
q3

# get only the values (without the descriptive labels)
q1 <- unname(q1)
q1
q3 <- unname(q3)
q3

# Logical subsetting selects all items inside the closed interval in one
# vectorised step: no element-by-element loop and no vector grown with c()
clean_data <- my_data[my_data >= q1 & my_data <= q3]

boxplot(clean_data)

# 2) Clean the data, considering only the items within
# the interval [Q1-1.5ID, Q3+1.5ID], where ID = Q3-Q1.

ID <- q3 - q1
ID

low_th <- q1 - 1.5 * ID
high_th <- q3 + 1.5 * ID

low_th
high_th

# Same vectorised filter, now with the wider thresholds
clean_data <- my_data[my_data >= low_th & my_data <= high_th]

boxplot(clean_data)

	mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
Mazda RX4	21.0	6	160	110	3.90	2.620	16.46	0	1	4	4
Mazda RX4 Wag	21.0	6	160	110	3.90	2.875	17.02	0	1	4	4
Datsun 710	22.8	4	108	93	3.85	2.320	18.61	1	1	4	1
Hornet 4 Drive	21.4	6	258	110	3.08	3.215	19.44	1	0	3	1
Hornet Sportabout	18.7	8	360	175	3.15	3.440	17.02	0	0	3	2
Valiant	18.1	6	225	105	2.76	3.460	20.22	1	0	3	1