# Build a numeric sample vector.
x <- c(12, 70, 21, 4.2, 1.8, 12, 5.4, -2.1, -8, -5)
# Arithmetic mean of the sample.
res <- mean(x)
print(res)
# Median of the sample.
res <- median(x)
print(res)
# Missing values: the same sample with three NA entries appended.
x <- c(x, rep(NA, 3))
# First with the default arguments ...
res <- mean(x)
print(res)
# ... then the same computation with na.rm = TRUE.
res <- mean(x, na.rm = TRUE)
print(res)
How was the mean computed in the second case?
1) Each NA is treated as zero
2) Each NA is ignored
# A small sample containing one extreme negative value.
x <- c(-50, 2, 3, 2, 1, 4, 2, 3, 3)
# Mean of the sample.
res <- mean(x)
print(res)
# Median of the same sample.
res <- median(x)
print(res)
Why are the mean and the median so different in both of the previous two examples?
When the trim parameter is supplied, the values in the vector are sorted and the required number of observations is dropped from each end before the mean is computed. With trim = 0.25, 25% of the values at each end are dropped from the calculation of the mean.
# With trim = 0.25 the result is also called the interquartile mean.
res <- mean(x, trim = 0.25)
print(res)
# Median of the same vector, for comparison.
res <- median(x)
print(res)
Why are the trimmed mean and the median more similar now?
# Show the values in ascending order.
sort(x)
# Standard deviation of x.
std_d <- sd(x)
print(std_d)
# Variance of x.
v <- var(x)
print(v)
# The square root of the variance and the square of the standard
# deviation are printed for comparison with the two values above.
print(sqrt(v))
print(std_d^2)
# Open the help page of sort() (same as ?sort).
help("sort")
# New sample used for the quantile examples.
x <- c(12, 31, 20, 12, 10, 50, 65, 23, 33, 10)
print(x)
# Called without probabilities, quantile() uses its default set.
quantile(x)
q_res <- quantile(x)
print(q_res)
# The result is a named vector, so one quantile can be picked by label.
q_res["25%"]
sort(x) # ascending order
# quantile() also accepts the desired probability explicitly,
# which gives the corresponding percentile.
quantile(x, probs = 0.25)
quantile(x, probs = 0.50)
quantile(x, probs = 0.67)
q_67 <- quantile(x, probs = 0.67)
# Label attached to the returned value.
names(q_67)
Boxplots are useful data visualization tools to observe how the data are distributed.
# Column names and first rows of the built-in mtcars data set.
print(names(mtcars))
head(mtcars)
# Keep only the mpg and cyl columns.
sub_data <- mtcars[, c("mpg", "cyl")]
print(head(sub_data))
# '~' == ALT+126 (ALT+n for Mac users)
# How does mpg vary depending on vs?
boxplot(
  mpg ~ vs,
  data = mtcars,
  xlab = "Type of engine",
  ylab = "Miles per gallon",
  main = "mpg distributions"
)
?boxplot
# Same plot, this time grouping mpg by cyl.
boxplot(
  mpg ~ cyl,
  data = mtcars,
  xlab = "Num. of cylinders",
  ylab = "Miles per gallon",
  main = "mpg distributions"
)
# Fix the RNG seed so the sample is reproducible.
set.seed(123)
# Draw 1000 random Gaussian values with mean = 10 and sigma = 5.
data <- rnorm(n = 1000, mean = 10, sd = 5)
# Mean and median of the sample.
mean(data)
median(data)
As expected, the mean and the median have similar values.
# Draw a density-scaled histogram of X and overlay a normal curve.
#
# Arguments:
#   X    numeric vector. Non-finite entries (NA, NaN, Inf) are dropped
#        first, so that min(), max(), mean() and sd() below work on the
#        same values that hist() bins.
#   main plot title; defaults to "Histogram".
#
# The curve is dnorm() evaluated on 40 equally spaced points between
# min(X) and max(X), with mean(X) and sd(X) as its parameters.
#
# Returns the value of the final lines() call; the function is used for
# its plotting side effect.
density_plot <- function(X, main = "Histogram") {
  X <- X[is.finite(X)]
  x2 <- seq(min(X), max(X), length.out = 40)
  fun <- dnorm(x2, mean = mean(X), sd = sd(X))
  # Compute the bin densities without plotting, so the y axis can be
  # sized to fit both the tallest bar and the peak of the curve.
  # Using only max(fun) clips the bars whenever they rise above the curve.
  bars <- hist(X, plot = FALSE)
  hist(X,
    freq = FALSE,
    col = "white",
    ylim = c(0, max(fun, bars$density)),
    main = main
  )
  lines(x2, fun, col = 2, lwd = 2)
}
# Show the data distribution: density histogram with the fitted curve,
# then a plain histogram, then a boxplot.
density_plot(data)
hist(data)
boxplot(data, main = "Boxplot of data")
Now add a few noise values, i.e. values belonging to another distribution.
# 50 values drawn from a different Gaussian (mean = 1500, sd = 5).
noise <- rnorm(n = 50, mean = 1500, sd = 5)
hist(noise)
# Append the noise to the original sample.
noisy_data <- c(data, noise)
# Mean and median of the perturbed sample.
mean(noisy_data)
median(noisy_data)
Now the median and the mean are very different, although we perturbed only about 5% of the population (50 new values out of 1050 in total).
# Same three views as before, now for the perturbed sample.
hist(noisy_data)
boxplot(noisy_data, main = "Boxplot of noisy data")
density_plot(noisy_data)
This simple experiment shows how even a small number of noisy values can strongly affect the mean; in such cases the median can be more reliable.
Create a set of random data, then add some noise considering both tails and:
1) Clean the data, considering only the items within the interval [Q1,Q3]
2) Clean the data, considering only the items within the interval [Q1 - 1.5*ID, Q3 + 1.5*ID], where ID = Q3 - Q1.
# Solution (proposed by class)
set.seed(123)
data <- rnorm(n = 1000, mean = 10, sd = 5)
# Noise around -1000, plus a copy of it shifted by 2000 (around +1000),
# so that both tails are perturbed.
noise_1 <- rnorm(n = 50, mean = -1000, sd = 5)
noise_2 <- 2000 + noise_1
d <- c(data, noise_1, noise_2)
# Default quantiles; the quartiles are picked by their labels.
q <- quantile(d)
q1 <- q["25%"] # q1 = quantile(d, 0.25)
q3 <- q["75%"] # q3 = quantile(d, 0.75)
# 1.5 times the interquartile distance.
k <- 1.5 * (q3 - q1)
# NOTE(review): both filters use strict inequalities, so values exactly
# equal to a bound are excluded.
c1 <- d[q1 < d & d < q3] # 1)
c2 <- d[(q1 - k) < d & d < (q3 + k)] # 2)
hist(d)
hist(c1)
hist(c2)
# Create a set of random data
my_data <- rnorm(1000, mean = 10, sd = 1)
# Add some noise considering both tails
noise <- rnorm(50, mean = 23, sd = 100)
my_data <- c(my_data, noise)
boxplot(my_data)
# 1) Clean the data, considering only the items within the interval [Q1,Q3]
q1 <- quantile(my_data, 0.25)
q1
q3 <- quantile(my_data, 0.75)
q3
# get only the values (without the descriptive labels)
q1 <- unname(q1)
q1
q3 <- unname(q3)
q3
# Logical indexing selects all items inside the closed interval in one
# vectorized step. This avoids growing the result with c() inside a loop
# and does not overwrite any variable named x.
clean_data <- my_data[my_data >= q1 & my_data <= q3]
boxplot(clean_data)
# 2) Clean the data, considering only the items within
# the interval [Q1-1.5ID, Q3+1.5ID], where ID = Q3-Q1.
ID <- q3 - q1
ID
low_th <- q1 - 1.5 * ID
high_th <- q3 + 1.5 * ID
low_th
high_th
# Same vectorized filter, now with the wider thresholds.
clean_data <- my_data[my_data >= low_th & my_data <= high_th]
boxplot(clean_data)