# ---------------------------------------------------------------------------
# Package installation (run once; uncomment what is missing)
# ---------------------------------------------------------------------------
# install.packages("gutenbergr")
# install.packages("wordcloud")
# remotes::install_github("r-link/corrmorant")
# install.packages("corrmorant")
# install.packages("DataExplorer")
# install.packages("inspectdf")
# install.packages("skimr")
# install.packages("corrr")
# install.packages("PerformanceAnalytics")

library(PerformanceAnalytics)
library(skimr)
library(corrr)
library(inspectdf)
library(DataExplorer)
library(installr)
library(sparklyr)
library(dplyr)
library(microbenchmark)
library(tidyverse)
library(DBI)
library(tidytext)
library(stringr)
library(gutenbergr)
library(wordcloud)
library(corrmorant)
library(caret)
library(e1071)

# ---------------------------------------------------------------------------
# Connect to Spark
# ---------------------------------------------------------------------------
Sys.setenv(JAVA_HOME = "C:/Users/hp/Desktop/DM/zulu8.78.0.19-ca-jdk8.0.412-win_x64")
# spark_disconnect(sc)
sc <- spark_connect(master = "local")

# ---------------------------------------------------------------------------
# Load the data set
# ---------------------------------------------------------------------------
# Alternative location used previously:
#   "C:/Users/hp/Desktop/BIG data/10 R +Spark/measurements.csv"
# (the original script assigned it and immediately overwrote it).
csv_path <- "C:/Users/hp/Desktop/DM/measurements.csv"

# Read the CSV file into a Spark DataFrame
consum <- spark_read_csv(sc, name = "consum", path = csv_path, header = TRUE)

# ---------------------------------------------------------------------------
# Data cleaning
# ---------------------------------------------------------------------------
# Inspect the raw table
glimpse(consum)

# 1. Replace the decimal comma with a dot and cast the measurement columns to
#    numeric right away, so that every later aggregate (e.g. the mean used for
#    imputation) works on numbers instead of relying on implicit string casts.
consum <- consum %>%
  mutate(
    distance = as.numeric(regexp_replace(distance, ",", ".")),
    consume = as.numeric(regexp_replace(consume, ",", ".")),
    temp_inside = as.numeric(regexp_replace(temp_inside, ",", ".")),
    temp_outside = as.numeric(temp_outside),
    refill_liters = regexp_replace(refill_liters, ",", ".")
  )

# 2. Count missing values per column
consum %>%
  summarise(across(everything(), ~ sum(as.integer(is.na(.)), na.rm = TRUE))) %>%
  collect()

# Only temp_inside is imputed: compute its mean ...
media_temp_inside <- consum %>%
  summarise(media_temp_inside = mean(temp_inside, na.rm = TRUE)) %>%
  collect() %>%
  pull(media_temp_inside)

# ... and replace the missing values with it. `!!` forces the local R value to
# be inlined into the generated Spark SQL.
consum <- consum %>%
  mutate(temp_inside = coalesce(temp_inside, !!media_temp_inside))

# 3. Derive a speed category and recode the 0/1 flags as "Yes"/"No"
consum <- consum %>%
  mutate(
    speed_cat = case_when(
      speed < 32 ~ "small",
      speed <= 40 ~ "medium",
      TRUE ~ "high"
    ),
    AC = ifelse(AC == 1, "Yes", "No"),
    rain = ifelse(rain == 1, "Yes", "No"),
    sun = ifelse(sun == 1, "Yes", "No")
  )

# 4. Drop the variables that are not used further
consum <- consum %>%
  select(-refill_gas, -refill_liters, -specials, -speed)

# Check the cleaned table
glimpse(consum)

# ---------------------------------------------------------------------------
# EDA
# ---------------------------------------------------------------------------
# Bring the Spark table into a local R data frame
consun <- collect(consum)

# Memory used by each column
inspect_mem(consun) %>% show_plot()

# Column types as a chart
inspect_types(consun) %>% show_plot()

# Share of missing values per column (the original profiled the unrelated
# built-in `airquality` data set here)
inspect_na(consun) %>% show_plot()

# Basic facts about the collected data
intro_stats <- DataExplorer::introduce(consun)
intro_stats

# Introductory overview plot; plot_intro() expects the raw data, not the
# output of introduce()
plot_intro(consun)

# Missing-value profile
plot_missing(consun)

# Histograms for all numeric variables
DataExplorer::plot_histogram(consun)

# Density curves for all numeric variables
DataExplorer::plot_density(consun)

# Descriptive statistics for the numeric variables (computed in Spark)
sdf_describe(consum, cols = c("distance", "consume", "temp_inside", "temp_outside"))

# Descriptive statistics (computed locally)
consun %>% skimr::skim()

# Correlation matrix plot; only a handful of predictors are available
corrmorant(consun)

# Same plot with more detail
temp <- consun
ggcorrm(data = temp) +
  lotri(geom_point(alpha = 0.5)) +
  lotri(geom_smooth(method = "lm")) +
  utri_heatmap() +
  utri_corrtext() +
  dia_names(y_pos = 0.15, size = 3) +
  dia_histogram(lower = 0.3, fill = "grey80", color = 1) +
  scale_fill_corr() +
  labs(title = "Correlation Plot")

# Correlations among the variables
consun %>% DataExplorer::plot_correlation()

## Display the data types as text...
temp <- inspectdf::inspect_types(consun)
temp

# Correlation plots for the numeric variables
numeric_vars <- consun %>% select(where(is.numeric))

numeric_vars %>%
  corrr::correlate() %>%
  corrr::rplot()

chart.Correlation(numeric_vars, histogram = TRUE, pch = "+")

# ---------------------------------------------------------------------------
# Association between gas type and air conditioning
# ---------------------------------------------------------------------------
# Compute the cell frequencies in Spark
contingency_table_spark <- consum %>%
  group_by(gas_type, AC) %>%
  summarise(count = n())

# Bring the contingency table into R
contingency_table <- contingency_table_spark %>%
  collect() %>%
  ungroup()

# Build the gas_type x AC table by name. Spark returns grouped rows in an
# arbitrary order, so filling a matrix positionally could put counts in the
# wrong cells; xtabs() also fills absent combinations with 0.
contingency_matrix <- xtabs(count ~ gas_type + AC, data = contingency_table)

# Chi-square test of independence
chisq_result <- chisq.test(contingency_matrix)

# Extract the p-value
p_value <- chisq_result$p.value

# Bar chart with the p-value shown once (annotate() draws a single label,
# whereas geom_text() would repeat it for every row of the data)
ggplot(contingency_table, aes(x = gas_type, y = count, fill = AC)) +
  geom_col(position = "dodge") +
  labs(x = "Tip de gaz", y = "Număr de mașini", fill = "Aer condiționat") +
  ggtitle("Asocierea între tipul de gaz și aerul condiționat") +
  annotate(
    "text",
    x = 1,
    y = max(contingency_table$count) - 5,
    label = paste("p-value =", round(p_value, 4)),
    vjust = -1,
    hjust = -0.1,
    size = 4,
    color = "black"
  )

# ---------------------------------------------------------------------------
# Modelling: transform the target variable
# ---------------------------------------------------------------------------
# Binary target: 1 when consumption is above 5, otherwise 0, stored as double
consum <- consum %>%
  mutate(consume_binary = as.double(ifelse(consume > 5, 1, 0)))
glimpse(consum)

# Distribution of the binary target
consum %>%
  group_by(consume_binary) %>%
  summarise(count = n()) %>%
  collect() %>%
  print()
# ---------------------------------------------------------------------------
# Train / test split
# ---------------------------------------------------------------------------
# Partition into train (60%) and test (40%)
model_partition_tbl <- consum %>%
  sdf_random_split(train = 0.6, test = 0.4, seed = 1234)
glimpse(model_partition_tbl)

# Create table references
consum_train <- sdf_register(model_partition_tbl$train, "consum_train")
consum_test <- sdf_register(model_partition_tbl$test, "consum_test")

# ---------------------------------------------------------------------------
# Logistic regression
# ---------------------------------------------------------------------------
# Use every column as a predictor EXCEPT `consume`: the target
# `consume_binary` is derived from it, so `consume_binary ~ .` would leak the
# label into the features.
class_predictors <- setdiff(
  colnames(consum_train),
  c("consume", "consume_binary")
)
ml_formula <- reformulate(class_predictors, response = "consume_binary")

(ml_log <- ml_logistic_regression(consum_train, formula = ml_formula))

# Predictions on the test set (the original left this assignment commented
# out, so `p_ml_log` was undefined below)
p_ml_log <- ml_predict(ml_log, consum_test)
glimpse(p_ml_log)

# AUC on the test set
ml_binary_classification_evaluator(p_ml_log, label_col = "consume_binary")

# ---------------------------------------------------------------------------
# Decision tree, random forest, gradient boosted trees
# ---------------------------------------------------------------------------
## Decision tree
(ml_dt <- ml_decision_tree(consum_train, formula = ml_formula))
summary(ml_dt)

## Random forest
(ml_rf <- ml_random_forest(consum_train, formula = ml_formula))
summary(ml_rf)

## Gradient boosted trees
(ml_gbt <- ml_gradient_boosted_trees(consum_train, formula = ml_formula))
summary(ml_gbt)

# All tree-based models in one named list
ml_models <- list(
  "Decision Tree" = ml_dt,
  "Random Forest" = ml_rf,
  "Gradient Boosted Trees" = ml_gbt
)

# Score one fitted model on the test set.
#   model: a fitted sparklyr ml_model.
# Returns a Spark table with the columns `consume_binary` and `prediction`.
# Reads the global `consum_test`.
score_model <- function(model) {
  pred <- ml_predict(model, consum_test)
  select(pred, consume_binary, prediction)
}

# Apply score_model to every model in ml_models
ml_score <- lapply(ml_models, score_model)
ml_score

# ---------------------------------------------------------------------------
# Compare model performance
# ---------------------------------------------------------------------------
# Accuracy of one scored table.
#   data:     Spark table with `consume_binary` and a numeric `prediction`.
#   cutpoint: predictions above this value are classified as 1, else 0.
# Returns the accuracy as a single number.
c_accur <- function(data, cutpoint = 0.5) {
  data %>%
    mutate(prediction = if_else(prediction > cutpoint, 1.0, 0.0)) %>%
    ml_multiclass_classification_evaluator(
      label_col = "consume_binary",
      prediction_col = "prediction",
      metric_name = "accuracy"
    )
}

# Evaluate each model once; vapply() guarantees a numeric vector
model_accuracy <- vapply(ml_score, c_accur, numeric(1))
model_auc <- vapply(
  ml_score,
  ml_binary_classification_evaluator,
  numeric(1),
  label_col = "consume_binary",
  raw_prediction_col = "prediction"
)
model_accuracy
model_auc

# Data frame with the metrics expressed in percent
p_metrics <- data.frame(
  model = names(ml_score),
  AUC = 100 * model_auc,
  Accuracy = 100 * model_accuracy,
  row.names = NULL,
  stringsAsFactors = FALSE
)

# Plot the results
p_metrics %>%
  pivot_longer(c(AUC, Accuracy), names_to = "metric", values_to = "value") %>%
  ggplot(aes(reorder(model, value), value, fill = metric)) +
  geom_col(position = "dodge") +
  coord_flip() +
  xlab("") +
  ylab("Percent") +
  ggtitle("Performance Metrics")

# Feature importance of the gradient boosted model
ml_tree_feature_importance(sc = sc, model = ml_gbt)

# ---------------------------------------------------------------------------
# Alternative: regression on the continuous consumption
# ---------------------------------------------------------------------------
ml_formula <- formula(
  consume ~ distance + temp_inside + temp_outside + rain + sun + AC
)

# Regression metrics of a fitted model on a test set.
#   model:        a fitted sparklyr regression model.
#   test_data:    Spark table to predict on.
#   response_var: name of the label column in `test_data`.
# Returns list(RMSE = <root mean squared error>, R_squared = <R-squared>).
compute_metrics <- function(model, test_data, response_var) {
  predictions <- ml_predict(model, test_data)
  rmse <- ml_regression_evaluator(
    predictions,
    label_col = response_var,
    metric_name = "rmse"
  )
  r_squared <- ml_regression_evaluator(
    predictions,
    label_col = response_var,
    metric_name = "r2"
  )
  list(RMSE = rmse, R_squared = r_squared)
}

glimpse(consum)

# Multiple linear regression
model_lr <- ml_linear_regression(consum_train, formula = ml_formula)
summary(model_lr)
metrics_lr <- compute_metrics(model_lr, consum_test, "consume")
print(metrics_lr)

# Random forest
model_rf <- ml_random_forest(consum_train, formula = ml_formula)
summary(model_rf)
metrics_rf <- compute_metrics(model_rf, consum_test, "consume")
print(metrics_rf)

# Gradient boosted trees
model_gbt <- ml_gradient_boosted_trees(consum_train, formula = ml_formula)
summary(model_gbt)
metrics_gbt <- compute_metrics(model_gbt, consum_test, "consume")
print(metrics_gbt)

# Decision tree regressor
model_dt <- ml_decision_tree_regressor(consum_train, formula = ml_formula)
summary(model_dt)
metrics_dt <- compute_metrics(model_dt, consum_test, "consume")
print(metrics_dt)

# ---------------------------------------------------------------------------
# Compare the regression models
# ---------------------------------------------------------------------------
# One row per model; the list names become the `Model` column
all_metrics <- bind_rows(
  list(
    "MLR" = metrics_lr,
    "Random Forest" = metrics_rf,
    "Gradient Boosted Tree" = metrics_gbt,
    "Decision Tree" = metrics_dt
  ),
  .id = "Model"
)

# Colour per model
colors <- c(
  "MLR" = "red",
  "Random Forest" = "blue",
  "Gradient Boosted Tree" = "green",
  "Decision Tree" = "orange"
)

# RMSE chart
ggplot(all_metrics, aes(x = Model, y = RMSE, fill = Model)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  labs(title = "Compararea modelelor", x = "Model", y = "RMSE") +
  theme_minimal()

# R-squared chart
ggplot(all_metrics, aes(x = Model, y = R_squared, fill = Model)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  labs(title = "Compararea modelelor", x = "Model", y = "R_squared") +
  theme_minimal()

# ---------------------------------------------------------------------------
# Variable importance per model: random forest
# ---------------------------------------------------------------------------
importance_rf <- ml_tree_feature_importance(sc = sc, model = model_rf)
print(importance_rf)

# Convert to a data frame for plotting
importance_rf_df <- as.data.frame(importance_rf)

# Bar chart of the variable importance, most important first
ggplot(importance_rf_df, aes(x = reorder(feature, -importance), y = importance)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Importanța variabilelor în modelul RandomForest",
    x = "Variabilă",
    y = "Importanță"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar chart of feature importances, ordered from most to least important.
#   importance_df: data frame with the columns `feature` and `importance`.
#   plot_title:    title shown above the chart.
# Returns the ggplot object (printed when called at top level).
plot_feature_importance <- function(importance_df, plot_title) {
  ggplot(importance_df, aes(x = reorder(feature, -importance), y = importance)) +
    geom_col(fill = "skyblue") +
    labs(title = plot_title, x = "Variabilă", y = "Importanță") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# ---------------------------------------------------------------------------
# Gradient boosted trees
# ---------------------------------------------------------------------------
# Variable importance
importance_gbt <- ml_tree_feature_importance(sc = sc, model = model_gbt)
print(importance_gbt)

# As a data frame, then plotted
importance_gbt_df <- as.data.frame(importance_gbt)
plot_feature_importance(
  importance_gbt_df,
  "Importanța variabilelor în modelul Gradient Boosted Trees"
)

# ---------------------------------------------------------------------------
# Decision tree
# ---------------------------------------------------------------------------
# Variable importance
importance_dt <- ml_tree_feature_importance(sc = sc, model = model_dt)
print(importance_dt)

# As a data frame, then plotted
importance_dt_df <- as.data.frame(importance_dt)
plot_feature_importance(
  importance_dt_df,
  "Importanța variabilelor în modelul Decision Tree"
)

# ---------------------------------------------------------------------------
# Cross-validation to evaluate model performance
# ---------------------------------------------------------------------------
# Number of folds
k <- 6

# One equal weight per fold
weights <- rep(1 / k, k)

# Name each fold
names(weights) <- paste0("fold", seq_len(k))

# Split the data into folds
s_consum_split <- sdf_random_split(consum, weights = weights, seed = 1234)
glimpse(s_consum_split)

# Training set = every fold except the first, row-bound as Spark tables
# (base rbind() does not combine Spark tables, and 2:5 left out fold 6);
# test set = the first fold itself, extracted with [[ ]] rather than as a list.
s_training_set <- sdf_bind_rows(s_consum_split[-1])
s_test_set <- s_consum_split[[1]]

# Gradient boosted tree on the manual split (kept disabled)
# s_gbt <- ml_gradient_boosted_trees(s_training_set, ml_formula)
# s_gbt_p <- ml_predict(s_gbt, s_test_set)
# s_gbt_p %>%
#   ml_binary_classification_evaluator(
#     label_col = "consume_binary",
#     raw_prediction_col = "prediction"
#   )

# k-fold cross-validation for binary classification models with sparklyr.
#
#   data:      Spark table holding the response and the features.
#   response:  name (string) of the 0/1 response column.
#   features:  character vector of feature names; NULL (default) uses every
#              column of `data` except `response`.
#   model_fun: sparklyr model function (or purrr-style lambda) that accepts
#              `(x, formula = )`, e.g. ml_random_forest.
#   k:         number of folds.
#   seed:      optional integer seed for the fold split; NULL (default) draws
#              a random seed, matching the previous unseeded behaviour.
#
# Returns list(fits, predictions, evals): the k fitted models, their
# predictions on the held-out fold, and the area under the ROC curve per fold.
ml_classification_cross_validation <- function(
    data,
    response,
    features = NULL,
    model_fun,
    k,
    seed = NULL
) {
  stopifnot(is.character(response), length(response) == 1L, k >= 2)

  if (is.null(seed)) {
    seed <- sample(.Machine$integer.max, 1L)
  }

  # Equal, named weights: one partition per fold
  fold_ids <- seq_len(k)
  weights <- rep(1 / k, times = k)
  names(weights) <- paste0("fold", fold_ids)
  data_cv <- sdf_random_split(data, weights = weights, seed = seed)

  # Training set i = all partitions except partition i
  data_splits <- purrr::map(fold_ids, ~ sdf_bind_rows(data_cv[-.x]))

  # Default features: every column except the response
  if (is.null(features)) {
    features <- setdiff(colnames(data), response)
  }

  # The formula interface is shared by the sparklyr model functions, unlike
  # the deprecated `response =` / `features =` arguments.
  model_formula <- stats::reformulate(features, response = response)
  fit_model <- purrr::as_mapper(model_fun)
  fits <- purrr::map(data_splits, ~ fit_model(.x, formula = model_formula))

  # Predict each held-out fold with the model that did not see it
  preds <- purrr::map2(fits, fold_ids, ~ ml_predict(.x, data_cv[[.y]]))

  # Label column first, score column second (the original passed them the
  # other way round)
  evals <- purrr::map(
    preds,
    ml_binary_classification_evaluator,
    label_col = response,
    raw_prediction_col = "prediction"
  )

  list(fits = fits, predictions = preds, evals = evals)
}

# The classification target is `consume_binary`; `consume` is excluded from
# the features because the target is derived from it.
cv_features <- setdiff(colnames(consum), c("consume", "consume_binary"))

##### Gradient boosted trees
gbt_fold_gb <- ml_classification_cross_validation(
  consum,
  response = "consume_binary",
  features = cv_features,
  model_fun = ml_gradient_boosted_trees,
  k = 6,
  seed = 1234
)

##### Random forest
gbt_fold_rf <- ml_classification_cross_validation(
  consum,
  response = "consume_binary",
  features = cv_features,
  model_fun = ml_random_forest,
  k = 6,
  seed = 1234
)

##### Decision tree (its numeric prediction is used as the ranking score)
gbt_fold_dt <- ml_classification_cross_validation(
  consum,
  response = "consume_binary",
  features = cv_features,
  model_fun = ml_decision_tree_regressor,
  k = 6,
  seed = 1234
)

# Mean AUC over the folds
##### Gradient boosted trees
mean(unlist(gbt_fold_gb$evals))
##### Random forest
mean(unlist(gbt_fold_rf$evals))
##### Decision tree
mean(unlist(gbt_fold_dt$evals))

# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------
# NOTE(review): `consum` is already a Spark table here; confirm that
# sdf_copy_to() accepts it in the installed sparklyr version.
consum_p <- sdf_copy_to(sc, consum, "consum_p", overwrite = TRUE)

# Transform all columns to numeric data type
consum_p <- consum_p %>%
  mutate(across(everything(), as.numeric))

# ---------------------------------------------------------------------------
# Training and test data for the pipeline
# ---------------------------------------------------------------------------
# 80% train / 20% test
model_partition_tbl <- sdf_random_split(
  consum_p,
  train = 0.8,
  test = 0.2,
  seed = 1234
)
glimpse(model_partition_tbl)

# Register both partitions as named Spark tables
consum_p_train <- sdf_register(model_partition_tbl$train, "consum_p_train")
consum_p_test <- sdf_register(model_partition_tbl$test, "consum_p_test")

# A magrittr functional sequence that fits a linear model to whatever data it
# is applied to
e_pipeline <- . %>%
  lm(consume_binary ~ distance + temp_inside, data = .)
e_pipeline

lm_model <- e_pipeline(consum_p)
lm_model

# dplyr filter, then the SQL statement that ft_dplyr_transformer derives from it
consum_p_subset <- consum_p %>%
  filter(consume_binary > 0)

ft_dplyr_transformer(sc, consum_p_subset) %>%
  ml_param("statement")

# The same transformation as a pipeline stage, printed to show its stages
ml_pipeline(sc) %>%
  ft_dplyr_transformer(tbl = consum_p_subset)

# Stored in `pipeline` and printed again
pipeline <- ml_pipeline(sc) %>%
  ft_dplyr_transformer(tbl = consum_p_subset)
pipeline

# Full pipeline: binarize both inputs, assemble them with an R formula, then
# fit a logistic regression.
# NOTE(review): thresholds 10000 (distance) and 95 (temp_inside) look far
# outside a plausible range for these columns; if so, both binarized features
# are constant. Confirm against the data.
pipeline <- ml_pipeline(sc) %>%
  ft_binarizer(
    input_col = "distance",
    output_col = "distance_bin_temp",
    threshold = 10000
  ) %>%
  ft_binarizer(
    input_col = "temp_inside",
    output_col = "temp_inside_bin_temp",
    threshold = 95
  ) %>%
  ft_r_formula(consume_binary ~ distance_bin_temp + temp_inside_bin_temp) %>%
  ml_logistic_regression()

# Fit the pipeline on the training partition
model <- ml_fit(pipeline, consum_p_train)

# Predict the test partition
predictions <- ml_transform(model, consum_p_test)

# Inspect the predictions
glimpse(predictions)

# Actual vs. predicted counts
predictions %>%
  group_by(consume_binary, prediction) %>%
  tally()

# Accuracy: share of rows where the prediction equals the label
accuracy <- predictions %>%
  mutate(correct = ifelse(consume_binary == prediction, 1, 0)) %>%
  summarise(accuracy = mean(correct))
accuracy

# Result recorded by the original author: the logistic regression trained
# through this pipeline reached an accuracy of 59.7% on the test set.

# ---------------------------------------------------------------------------
# H2O
# ---------------------------------------------------------------------------
# Export the Spark table to a CSV at an explicit path and import that same
# path into H2O. The original wrote to the current directory and then called
# setwd("C:\Users\..."), which is a parse error in R ("\U" starts a Unicode
# escape) and could also point the import at a different directory.
data_dir <- "C:/Users/hp/Desktop/DM"
consum_csv <- file.path(data_dir, "consum.csv")

# collect() materialises the Spark table locally; row.names = FALSE avoids an
# extra row-index column that H2O would otherwise treat as a predictor.
write.csv(collect(consum), consum_csv, row.names = FALSE)

# H2O installation (run once, uncomment as needed)
# install.packages("rsparkling")
# install.packages("h2o", type = "source", repos = (c("http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R")))
# install.packages("e:/###FEAA/2024/DM/h2o_3.44.0.3.zip", repos = NULL, type = "source")
library(rsparkling)
library(h2o)

# Start a local H2O instance
localH2O <- h2o.init()

# Import the CSV file
data <- h2o.importFile(consum_csv)

# Check the import
head(data)

# Split into 70% training, 15% validation and 15% test; the seed makes the
# split reproducible
splits <- h2o.splitFrame(
  data = data,
  ratios = c(0.7, 0.15),
  seed = 1
)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]

# Check the training set
head(train)

# Target and predictors. `consume_binary` is derived from `consume`, so it is
# excluded from the predictors to avoid leaking the target.
y <- "consume"
x <- setdiff(names(data), c(y, "consume_binary"))

# Columns of the training set
print(colnames(train))

# Train the GBM (a regression model, because `consume` is numeric)
gbm_fit <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train,
  validation_frame = valid,
  model_id = "gbm_fit",
  seed = 1
)

# Evaluate on the test set
gbm_perf <- h2o.performance(model = gbm_fit, newdata = test)

# Show the performance summary
gbm_perf

# AUC is not defined for a regression model; report RMSE and R-squared, the
# same metrics used for the Spark regression models.
h2o.rmse(gbm_perf)
h2o.r2(gbm_perf)

# Shut H2O down
h2o.shutdown(prompt = FALSE)