# Recovered from a git diff that added this script as a new file:
#   diff --git a/Exploring_bodyfat.R b/Exploring_bodyfat.R
#   new file mode 100644
#   index 0000000000000000000000000000000000000000..9ebe30f04308efc4caf389b5684a071c9328f07f
#   --- /dev/null
#   +++ b/Exploring_bodyfat.R
#   @@ -0,0 +1,204 @@

# Loading the data ----

# Download only when there is no local copy yet, so re-running the script
# does not hit the network every time.
if (!file.exists("bodyfat.csv")) {
  download.file(
    "https://git.embl.de/hsanchez/BTM2016_RDatasets/raw/master/bodyfat.csv",
    "bodyfat.csv"
  )
}

bodyfat <- read.csv("bodyfat.csv")

head(bodyfat)

# Names of the columns
names(bodyfat)

# Dimensions
dim(bodyfat)

# Checking only one column, try the TAB key
bodyfat$age
head(bodyfat[, 4])
head(bodyfat[, 4], 12)

# Numeric values describing age
summary(bodyfat$age)

# Getting the mean of the age values
summary(bodyfat$age)[4]
mean(bodyfat$age)

# Filtering rows by age ----

# How many are younger than 31 years old?
# nrow() counts the selected people (rows); length() on a data frame would
# count its columns instead.
younger_than_31 <- bodyfat[bodyfat$age < 31, ]
nrow(younger_than_31)

# Another way
not_older_than_30 <- bodyfat[!(bodyfat$age > 30), ]
nrow(not_older_than_30)

# Watch out!
head(bodyfat[!(bodyfat$age > 30), ])
head(bodyfat[!which(bodyfat$age > 30), ])
## not equivalent!
## which() returns row positions; applying `!` to those non-zero numbers gives
## only FALSE values, so the second call selects no rows at all.

# Column means: for loop versus apply ----

# The number of columns can be obtained as length(names(bodyfat)),
# dim(bodyfat)[2] or, most directly, ncol(bodyfat).
number_of_columns <- ncol(bodyfat)

# Preallocate a numeric (not logical) vector with one slot per column.
my_vector <- numeric(number_of_columns)
for (i in seq_len(number_of_columns)) {
  one_column <- bodyfat[, i]
  the_mean <- mean(one_column)
  my_vector[i] <- the_mean
}
names(my_vector) <- names(bodyfat)

my_vector

# apply() converts the data frame to a matrix before taking the column means.
apply(bodyfat, MARGIN = 2, FUN = mean)

# Benchmark candidate: column means of the global `bodyfat` with a for loop.
# Takes no arguments and returns a numeric vector named after the columns.
usingFor <- function() {
  column_means <- numeric(ncol(bodyfat))
  for (i in seq_along(bodyfat)) {
    column_means[i] <- mean(bodyfat[, i])
  }
  names(column_means) <- names(bodyfat)
  column_means
}

# Benchmark candidate: column means of the global `bodyfat` with apply().
# Takes no arguments and returns the named means invisibly.
usingApply <- function() {
  invisible(apply(bodyfat, MARGIN = 2, FUN = mean))
}

# replicate() re-evaluates its expression, so the functions have to be called:
# a bare `usingFor` only looks the function object up and times nothing.
# A modest repetition count keeps the comparison quick.
n_repetitions <- 1000
system.time(replicate(n_repetitions, usingFor()))
system.time(replicate(n_repetitions, usingApply()))

# Memory usage ----

# pryr supplies mem_change().
library(pryr)
mem_change(v <- 1:1e6)
mem_change(rm(v))
# Make sure `new_variable` does not exist before measuring its allocation.
# rm() on a missing object only warns, so check explicitly instead of try().
if (exists("new_variable", inherits = FALSE)) {
  rm(new_variable)
}
mem_change(new_variable <- bodyfat$age)


# Going back!

# Column means of the whole data set
apply(bodyfat, MARGIN = 2, FUN = mean)

# Splitting by age group, step by step ----

# Younger than 31
Subset_1_indexes <- which(bodyfat$age < 31)
Subset_1 <- bodyfat[Subset_1_indexes, ]

# Getting the 30+
temp_indexes <- which(bodyfat$age > 30)
Subset30plus <- bodyfat[temp_indexes, ]

# From the 30+ group: those younger than 51 ...
Subset_2_indexes <- which(Subset30plus$age < 51)
Subset_2 <- Subset30plus[Subset_2_indexes, ]

# ... and those older than 50
Subset_3_indexes <- which(Subset30plus$age > 50)
Subset_3 <- Subset30plus[Subset_3_indexes, ]

# Check that the age ranges of the two older groups look right
summary(Subset_2$age)
summary(Subset_3$age)

# Finally, getting the mean percent fat per Subset:
mean(Subset_1$percent.fat)
mean(Subset_2$percent.fat)
mean(Subset_3$percent.fat)

# The same three means with direct logical subsetting ----

# Young people
mean(bodyfat[bodyfat$age < 31, ]$percent.fat)
# Not so young people
mean(bodyfat[bodyfat$age > 30 & bodyfat$age < 51, ]$percent.fat)
# Definitely not young people
mean(bodyfat[bodyfat$age > 50, ]$percent.fat)

# Benchmark candidate: the step-by-step approach (which() plus intermediate
# data frames). Reads the global `bodyfat`, takes no arguments and returns the
# value of its last expression, the mean percent fat of the over-50 group.
# The exploratory summary() calls are left out here: their results were
# discarded and they only added time unrelated to the subsetting strategy.
Option1 <- function() {
  young <- bodyfat[which(bodyfat$age < 31), ]
  over_30 <- bodyfat[which(bodyfat$age > 30), ]
  middle <- over_30[which(over_30$age < 51), ]
  older <- over_30[which(over_30$age > 50), ]
  mean(young$percent.fat)
  mean(middle$percent.fat)
  mean(older$percent.fat)
}

# Benchmark candidate: direct logical subsetting. Reads the global `bodyfat`,
# takes no arguments and returns the mean percent fat of the over-50 group.
Option2 <- function() {
  mean(bodyfat[bodyfat$age < 31, ]$percent.fat)
  mean(bodyfat[bodyfat$age > 30 & bodyfat$age < 51, ]$percent.fat)
  mean(bodyfat[bodyfat$age > 50, ]$percent.fat)
}

# replicate() re-evaluates its expression, so the functions have to be called:
# a bare `Option1` only looks the function object up and times nothing.
# A modest repetition count keeps the comparison quick.
n_repetitions <- 1000
system.time(replicate(n_repetitions, Option1()))
system.time(replicate(n_repetitions, Option2()))

# Which columns are not the index, nor the age?
# Positions of the row-index column ("X") and of "age".
# %in% tests membership; `==` against a length-2 vector recycles it
# element-wise and only matches when the positions happen to line up.
I_do_not_want_these_ones <- which(names(bodyfat) %in% c("X", "age"))

# Select the remaining columns by positive position: negative indexing with
# an empty match would drop every column instead of none.
other_columns <- setdiff(seq_along(bodyfat), I_do_not_want_these_ones)
names(bodyfat[other_columns])

# I calculate the pearson correlations of age and all the other values
vapply(
  bodyfat[other_columns],
  function(x) cor(bodyfat$age, x),
  numeric(1)
)


cor(bodyfat$percent.fat, bodyfat$density)

plot(bodyfat$age, bodyfat$percent.fat, main = "Age vs Percent fat")
plot(bodyfat$age, bodyfat$density, main = "Age vs Density")
plot(bodyfat$percent.fat, bodyfat$density, main = "Percent fat vs Density")

pairs(bodyfat)

pairs(bodyfat[, 1:6])

message("Your code is talking to you")

# Conditions inside loops ----

#for(i in c(2, 1, 0, -1)){
#  sqrt(i)
#}

# Only take the square root of non-negative values. Results are not
# auto-printed inside a loop, so the root has to be printed explicitly.
for (i in c(2, 1, 0, -1)) {
  if (i >= 0) {
    print(sqrt(i))
  }
}

# Dealing with column positions that may not exist ----

# Trying to calculate the mean of the columns 4th, 3rd, and 17th
# Unknown vector
UnknownVector <- c(4, 3, 17)
#sapply( UnknownVector, function(x){
#  mean(bodyfat[, x])
#})

# bodyfat[ ,17]

# Lets prevent to have an error:
# option 1: wrap each call in try(). lapply() keeps the numeric means numeric;
# simplifying to a vector would turn them into text whenever one call fails.
lapply(UnknownVector, function(x) {
  try(mean(bodyfat[, x]))
})

# Option 2: keep only the positions that exist before computing anything.
valid_columns <- UnknownVector[UnknownVector %in% seq_along(bodyfat)]
vapply(valid_columns, function(x) mean(bodyfat[, x]), numeric(1))

# Option 3: catch the subsetting error, report it with a message and return
# NA for that position. tryCatch() handles the error locally, so the global
# show.error.messages option does not have to be switched off and on.
vapply(UnknownVector, function(x) {
  column <- tryCatch(bodyfat[, x], error = function(e) NULL)
  if (is.null(column)) {
    message(paste0("The column ", x, " does not exist, I'm sorry :( "))
    NA_real_
  } else {
    mean(column)
  }
}, numeric(1))