稍微修改一下原来封装发布Python回归模型的Shiny APP,就可以封装发布Tidy Models建立的回归模型,演示性质暂时只有SVM、随机森林、XGB 3种算法,从上一篇文章Tidy Models目前支持的算法模型中选出,因为Python上调试过,知道它们的性能不错。后面再研究怎样增加tidymodels对其它算法的支持。
一、运行效果
这个Shiny APP是Rstduio(现更名为Posit)大数据分析技术的全家桶,集成应用了它开发的主要先进R语言技术,包括shiny、tidymodels、Rmarkdown、dplyr、ggplot2等。通过IFrame嵌入Tomcat WEB APP中的方式与Java EE平台集成。从Java->Shiny->Tidy Models->Data直通到底。
二、源码
1、Melbourne_Tidy.R。
用经过贝叶斯调优的参数加载3种算法模型和经过预处理的数据,在训练集上训练模型,拟合训练集与测试集的预测输出,并收集各个算法的性能数据。由Shiny APP启动时加载。
library(tidymodels)
# 优先使用tidymodels的同名函数。
tidymodels_prefer()
# 加载经过预处理的数据
melbourne<- read.csv("/home/jean/data/Melbourne_housing_pre.csv")
# 过滤缺失值, 已经作过填零处理。
# Error: Missing data in columns: BuildingArea.
# 47 obs.
missing <- filter(melbourne, BuildingArea==0)
melbourne <- filter(melbourne, BuildingArea!=0)
# 划分训练集与测试集
set.seed(2023)
melbourne_split <- initial_split(melbourne, prop = 0.80)
melbourne_train <- training(melbourne_split)
melbourne_test <- testing(melbourne_split)
melbourne_metrics <- metric_set(rsq, rmse, mae)
# -----------------------------------------------------------------------------------------------
# 各种算法拟合的结果,先初始化为空data frame,后面再构建。
train = data.frame(origin = melbourne_train$LogPrice)
valid = data.frame(origin = melbourne_test$LogPrice)
perf = data.frame(algo=character(), train=numeric(), valid=numeric(), time=numeric())
# 用于显示异常值的原始数据
valid_X2 <- melbourne_test %>% select(-LogPrice)
train_X2 <- melbourne_train %>% select(-LogPrice)
# SVM best---------------------------------------------------------------------------------------
# 定义菜谱:回归公式与预处理
melbourne_rec<-
recipe(LogPrice ~ Year + YearBuilt + Distance + Lattitude + Longtitude + BuildingArea
+ Rooms + Bathroom + Car + Type_h + Type_t + Type_u, data = melbourne_train) %>%
# 标准化数值型变量
step_normalize(all_numeric_predictors())
# 定义模型:SVM, 定义要调整的参数
svm_spec <-
svm_rbf(cost = tune(), rbf_sigma = tune(), margin = tune()) %>%
set_engine('kernlab') %>%
set_mode('regression')
# 定义工作流
svm_wflow <-
workflow() %>%
add_model(svm_spec) %>%
add_recipe(melbourne_rec)
# 构造最佳参数
svm_param_best<-
tibble(
cost=11.0,
rbf_sigma=0.0512,
margin=0.196
)
# 最佳参数回填到工作流
svm_wflow_bo <-
svm_wflow %>%
finalize_workflow(svm_param_best)
# 用最佳参数在训练集全集上训练模型
t1<-proc.time()
svm_fit_bo<- svm_wflow_bo %>% fit(melbourne_train)
t2<-proc.time()
#生成训练、测试预测及性能数据
# 耗时
perf_svm <- c("svm",0.0,0.0,0.0)
temp <- t2-t1
perf_svm[[4]]<-round(temp[[3]],1)
# 测试集
# 预测值
melbourne_test_bo <- predict(svm_fit_bo, new_data = melbourne_test %>% select(-LogPrice))
# 合并真实值
melbourne_test_bo <- bind_cols(melbourne_test_bo, melbourne_test %>% select(LogPrice))
# 0.827, 0.219, 0.156
temp<-melbourne_metrics(melbourne_test_bo, truth = LogPrice, estimate = .pred)
perf_svm[[3]]<-round(temp$.estimate[[1]]*100,2)
valid$svm <- melbourne_test_bo$.pred
# 训练集
# 预测值
melbourne_train_bo <- predict(svm_fit_bo, new_data = melbourne_train %>% select(-LogPrice))
# 合并真实值
melbourne_train_bo <- bind_cols(melbourne_train_bo, melbourne_train %>% select(LogPrice))
# 0.888, 0.179, 0.134
temp<-melbourne_metrics(melbourne_train_bo, truth = LogPrice, estimate = .pred)
perf_svm[[2]]<-round(temp$.estimate[[1]]*100,2)
train$svm <- melbourne_train_bo$.pred
# 插入一行SVM的性能数据
perf[nrow(perf) + 1,]<- perf_svm
# Random Forest best---------------------------------------------------------------------------------------
# 定义菜谱:回归公式与预处理
melbourne_rec<-
recipe(LogPrice ~ Year + YearBuilt + Distance + Lattitude + Longtitude + BuildingArea
+ Rooms + Bathroom + Car + Type_h + Type_t + Type_u, data = melbourne_train) %>%
# 标准化数值型变量
step_normalize(all_numeric_predictors())
# 定义模型:随机森林, 定义要调整的参数
rf_spec <-
rand_forest(mtry = tune(), trees = tune(), min_n = tune()) %>% # 模型主参数
set_engine("ranger",
regularization.factor = tune("regularization"), # 引擎相关参数
sample.fraction = tune("max_samples"),
max.depth = tune("max_depth")) %>% # 引擎相关参数
set_mode("regression")
# 定义工作流
rf_wflow <-
workflow() %>%
add_model(rf_spec) %>%
add_recipe(melbourne_rec)
# 构造最佳参数
rf_param_best<-
tibble(
mtry = 5,
trees = 1742,
min_n = 3,
regularization = 0.820,
max_samples = 0.968,
max_depth = 39
)
# 最佳参数回填到工作流
rf_wflow_bo <-
rf_wflow %>%
finalize_workflow(rf_param_best)
# 用最佳参数在训练集全集上训练模型
t1<-proc.time()
rf_fit_bo<- rf_wflow_bo %>% fit(melbourne_train)
t2<-proc.time()
#生成训练、测试预测及性能数据
# 耗时
perf_rf <- c("rf",0.0,0.0,0.0)
temp <- t2-t1
perf_rf[[4]]<-round(temp[[3]],1)
# 测试集
# 预测值
melbourne_test_bo <- predict(rf_fit_bo, new_data = melbourne_test %>% select(-LogPrice))
# 合并真实值
melbourne_test_bo <- bind_cols(melbourne_test_bo, melbourne_test %>% select(LogPrice))
# 0.827, 0.219, 0.156
temp<-melbourne_metrics(melbourne_test_bo, truth = LogPrice, estimate = .pred)
perf_rf[[3]]<-round(temp$.estimate[[1]]*100,2)
valid$rf <- melbourne_test_bo$.pred
# 训练集
# 预测值
melbourne_train_bo <- predict(rf_fit_bo, new_data = melbourne_train %>% select(-LogPrice))
# 合并真实值
melbourne_train_bo <- bind_cols(melbourne_train_bo, melbourne_train %>% select(LogPrice))
# 0.888, 0.179, 0.134
temp<-melbourne_metrics(melbourne_train_bo, truth = LogPrice, estimate = .pred)
perf_rf[[2]]<-round(temp$.estimate[[1]]*100,2)
train$rf <- melbourne_train_bo$.pred
# 插入一行Random Forest的性能数据
perf[nrow(perf) + 1,]<- perf_rf
# XGB best---------------------------------------------------------------------------------------
# 定义菜谱:回归公式与预处理
melbourne_rec<-
recipe(LogPrice ~ Year + YearBuilt + Distance + Lattitude + Longtitude + BuildingArea
+ Rooms + Bathroom + Car + Type_h + Type_t + Type_u, data = melbourne_train) %>%
# 标准化数值型变量
step_normalize(all_numeric_predictors())
# 定义模型:XGB, 定义要调整的参数
xgb_spec <-
boost_tree(tree_depth = tune(), trees = tune(), learn_rate = tune(), min_n = tune(), loss_reduction = tune(), sample_size = tune(), stop_iter = tune()) %>%
set_engine('xgboost') %>%
set_mode('regression')
# 定义工作流
xgb_wflow <-
workflow() %>%
add_model(xgb_spec) %>%
add_recipe(melbourne_rec)
# 构造最佳参数
xgb_param_best<-
tibble(
trees = 1792,
min_n = 13,
tree_depth = 8,
learn_rate = 0.0138,
loss_reduction = 0.0115,
sample_size = 0.334,
stop_iter = 11
)
# 最佳参数回填到工作流
xgb_wflow_bo <-
xgb_wflow %>%
finalize_workflow(xgb_param_best)
# 用最佳参数在训练集全集上训练模型
t1<-proc.time()
xgb_fit_bo<- xgb_wflow_bo %>% fit(melbourne_train)
t2<-proc.time()
#生成训练、测试预测及性能数据
# 耗时
perf_xgb <- c("xgb",0.0,0.0,0.0)
temp <- t2-t1
perf_xgb[[4]]<-round(temp[[3]],1)
# 测试集
# 预测值
melbourne_test_bo <- predict(xgb_fit_bo, new_data = melbourne_test %>% select(-LogPrice))
# 合并真实值
melbourne_test_bo <- bind_cols(melbourne_test_bo, melbourne_test %>% select(LogPrice))
# 0.827, 0.219, 0.156
temp<-melbourne_metrics(melbourne_test_bo, truth = LogPrice, estimate = .pred)
perf_xgb[[3]]<-round(temp$.estimate[[1]]*100,2)
valid$xgb <- melbourne_test_bo$.pred
# 训练集
# 预测值
melbourne_train_bo <- predict(xgb_fit_bo, new_data = melbourne_train %>% select(-LogPrice))
# 合并真实值
melbourne_train_bo <- bind_cols(melbourne_train_bo, melbourne_train %>% select(LogPrice))
# 0.888, 0.179, 0.134
temp<-melbourne_metrics(melbourne_train_bo, truth = LogPrice, estimate = .pred)
perf_xgb[[2]]<-round(temp$.estimate[[1]]*100,2)
train$xgb <- melbourne_train_bo$.pred
# 插入一行Random Forest的性能数据
perf[nrow(perf) + 1,]<- perf_xgb
# 转换为数值类型
perf$train<- as.numeric(perf$train)
perf$valid<- as.numeric(perf$valid)
perf$time<- as.numeric(perf$time)
2、global.R。
调用上面的程序加载Tidy模型,并解决一些R包之间的函数命名冲突,运行library(tidymodels)后会提示所有的函数命名冲突。
library(conflicted)
library(ggplot2)
library(DT)
library(showtext)
# R Plot添加中文及其他字体[showtext]
# https://blog.csdn.net/weixin_46128755/article/details/125825935
showtext_auto()
# 加载library(tidymodels)后,R会自动检测函数名冲突,这里增加解决冲突的语句
# Warning: Error in <Anonymous>: [conflicted] `dataTableOutput` found in 2 packages.
# Warning: Error in <Anonymous>: [conflicted] `observe` found in 2 packages.
# Warning: Error in <Anonymous>: [conflicted] `renderDataTable` found in 2 packages.
conflict_prefer("dataTableOutput", "DT")
conflict_prefer("renderDataTable", "DT")
conflict_prefer("observe", "shiny")
# 确定各回归模型脚本的目录位置
print(getwd())
path<- "../../scripts/Melbourne_Tidy.R"
print(path)
# 装入脚本,用经过预处理的数据集,在装入时一次性加载所有Tidy回归模型并完成训练集与验证集的拟合。
t3<- proc.time()
source(path)
t4<- proc.time()
# 总耗时、系统耗时、用户耗时,...,...
# An object of class "proc_time" which is a numeric vector of length 5,
# containing the user, system, and total elapsed times for the currently running R process,
# and the cumulative sum of user and system times of any child processes
# spawned by it on which it has waited.
seconds<-t4-t3
# 各模型训练与验证的结果,封装为data frame
# train,训练集拟合结果。
# valid,测试集拟合结果。
# perf,各模型的性能数据。
# 回归算法列表
algos<-names(train)
# 这个是显示在下拉列表中供选择的算法名称,作为命名列表的名字,它的内容是dataframe的列名。
# 从第2项开始。
names(algos)<-c("Origin","SVM","RandomForest","XGB")
3、ui.R。
稍作修改,主要是算法列表不同了。
# 浏览器端UI函数。
fluidPage(
# Javascript 处理父窗口传入的参数等。
tags$head(
tags$script(HTML("
// 记录父窗口,初始为空。
parent = null;
// 处理接收到的消息。
window.addEventListener('message', function(e) {
//alert(e.data);
try{
//记录父窗口以备回发信息
parent = e.source;
//向服务器发送input变量更新消息
Shiny.setInputValue('algo', e.data);
} catch (error){
alert(error);
}
},false);
// 点击时传出参数
$(document).on('click', '.btn-success', function (evt) {
evt.preventDefault();
var selected = document.getElementById('rows_selected');
if (selected ==null || selected.value.length==0){
alert('没有选中的行!');
}else{
if (parent == null){
alert(selected.value);
} else {
try{
parent.postMessage(selected.value, '*');
} catch(error){
alert(error);
}
}
}
});
"))
),
sidebarLayout(
# Sidebar with a selectioninput and a numericInput
sidebarPanel(
h3("墨尔本房价回归模型示例"),
selectInput(
'algo',
'回归算法',
names(algos)[2:4],
selected = 'XGB'
),
sliderInput("threshold",
"异常值阀值%:",
min = 20, max = 40, value = 30),
HTML("<br>总耗时<br>"),
textOutput("seconds"),
HTML("<br>模型性能<br>"),
tableOutput("performance"),
HTML("说明:这是对ln(房价),即房价自然对数的回归,上表的ACC指标也是如此,因为ln(房价)接近正态分布,这样回归精度更高。<br>
exp(Predict)换算回原量纲后,回归精度略有下降,也接近88%。"),
HTML("<br><br><br><br><br><br><br><br><br><br><br><br>"),
textAreaInput("rows_selected","已选择异常值行号", rows =6, resize="vertical", value =""),
# 插入javascript,禁止自己修改 rows_selected textAreaInput
tags$script(HTML("
var rows_selected = document.getElementById('rows_selected');
rows_selected.disabled = true;
")),
HTML("<br>"),
actionButton("sendout", "传出选中的行", class = "btn-success"),
HTML("<br>"),HTML("<br>"),
downloadButton("report","下载分析报告")
),
# Show the network
mainPanel(
# 画拟合效果
h3(textOutput("parameters")),
HTML("<br>"),
imageOutput("trainPlot"),
HTML("<br>"),
HTML("<div style='text-align: center;'>"),
imageOutput("validPlot"),
HTML("</div>"),
HTML("<br><h3>验证集预测值异常(阀值以外区域)样本</h3><br>"),
dataTableOutput("outliers")
)
)
)
4、server.R。
没有修改。
# 输出图形到SVG临时文件以支持中文标注
# https://stackoverflow.com/questions/39093777/renderimage-and-svg-in-shiny-app/39263320#39263320
# 多图层图例设置参阅
# https://stackoverflow.com/questions/18394391/custom-legend-for-multiple-layer-ggplot
# Shiny data table国际化语言支持参阅
# https://rstudio.github.io/DT/004-i18n.html
# Shiny Data Table的操作可以参考
# https://rstudio.github.io/DT/shiny.html
# 选中行的操作可以参考
# https://yihui.shinyapps.io/DT-rows/
# ggplot2中文支持可以参考
# https://blog.csdn.net/weixin_46128755/article/details/125825935
# https://statisticsglobe.com/change-font-size-of-ggplot2-plot-in-r-axis-text-main-title-legend
shinyServer(function(input, output, session) {
algo<- reactive({
input$algo
})
threshold<- reactive({
input$threshold
})
# 更新由宿主页面传入的参数
observe({
print(algo())
# print(threshold())
# 更新由宿主页面传入的参数
updateTextInput(session, "algo", value = algo())
# 注意这里用updateTextInput更新文本到浏览器端,如果用updateSelectInput()会报错,可能要求的是下标。
# updateSelectInput(session, "algo", value = algo())
})
output$parameters<-renderText({
paste("算法:",algo()," 异常值阀值:",threshold(),"%",sep="")
})
output$seconds<-renderText({
seconds
})
output$performance<- renderTable({
names(perf)<-c("算法","训练集","验证集","耗时")
perf
},
digits = 3
)
# 第一种ggplot2作图支持中文的方法
output$trainPlot<-renderPlot({
colName<- algos[algo()]
data<- data.frame(origin = train["origin"], predict = train[colName])
names(data)<-c("origin","predict")
data<-data[order(data["origin"]),]
data["index"]<-1:nrow(data)
data["upper"]<-data["origin"]+log(1+threshold()/100)
data["lower"]<-data["origin"]+log(1-threshold()/100)
p<- ggplot(data = data)+
geom_point(mapping = aes(x = index, y = predict, color = "blue"), size=0.1)+
geom_line(mapping = aes(x = index, y = origin, color = "red"))+
geom_ribbon(aes(index, ymax=upper, ymin=lower, fill="yellow"), alpha=0.25)+
ggtitle("训练集拟合效果图")+
xlab("样本")+
ylab("ln(房价)")+
scale_fill_identity(name = '异常值区间', guide = 'legend',labels = c('正常值')) +
scale_colour_manual(name = '颜色',
values =c('blue'='blue','red'='red'), labels = c('预测值','实际值'))+
theme(plot.title = element_text(size = 24), legend.title=element_text(size=18),
legend.text=element_text(size=14), axis.title = element_text(size = 18),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
gridExtra::grid.arrange(egg::set_panel_size(p=p, width=unit(6, "in"), height=unit(4, "in")))
})
# 第二种ggplot2作图支持中文的方法
output$validPlot<-renderImage({
colName<- algos[algo()]
data<- data.frame(origin = valid["origin"], predict = valid[colName])
names(data)<-c("origin","predict")
data<-data[order(data["origin"]),]
data["index"]<-1:nrow(data)
data["upper"]<-data["origin"]+log(1+threshold()/100)
data["lower"]<-data["origin"]+log(1-threshold()/100)
image<-ggplot(data = data)+
geom_point(mapping = aes(x = index, y = predict, color = "blue"), size=0.1)+
geom_line(mapping = aes(x = index, y = origin, color = "red"))+
geom_ribbon(aes(index, ymax=upper, ymin=lower, fill="yellow"), alpha=0.25)+
ggtitle("验证集拟合效果图")+
xlab("样本")+
ylab("ln(房价)")+
scale_fill_identity(name = '异常值区间', guide = 'legend',labels = c('正常值')) +
scale_colour_manual(name = '颜色',
values =c('blue'='blue','red'='red'), labels = c('预测值','实际值'))
# 输出SVG到临时文件
# This file will be removed later by renderImage
outfile <- tempfile(fileext='.svg')
#This actually save the plot in a image
#ggsave(file=outfile, plot=image, width=mysvgwidth, height=mysvgheight)
ggsave(file=outfile, plot=image, width=6, height=4)
# Return a list containing the filename
list(src = normalizePath(outfile),
contentType = 'image/svg+xml',
#width = width,
height = 400,
alt = "验证集拟合效果图")
},
# SVG图发到客户端后,renderImage删除临时文件
deleteFile = TRUE
)
output$outliers<- renderDataTable({
colName<- algos[algo()]
data<- data.frame(origin = valid["origin"], predict = valid[colName])
names(data)<-c("origin","predict")
data["outlier"]<- FALSE
data[which(data["predict"] < data["origin"]+log(1-threshold()/100)),"outlier"] <-TRUE
data[which(data["predict"] > data["origin"]+log(1+threshold()/100)),"outlier"] <-TRUE
valid_X3<- valid_X2[,c("Lattitude","Longtitude","Distance","BuildingArea","Landsize","YearBuilt","Year")]
valid_X3["Type"]<-valid_X2["Type_h"]*4+valid_X2["Type_t"]*2+valid_X2["Type_u"]
valid_X3[which(valid_X3["Type"]==4),"Type"]<-"H"
valid_X3[which(valid_X3["Type"]==2),"Type"]<-"T"
valid_X3[which(valid_X3["Type"]==1),"Type"]<-"U"
#valid_X3["Origin"]<-data["origin"]
#valid_X3["Predict"]<-data["predict"]
# 转换为原量纲
valid_X3["Origin"]<-round(exp(data["origin"]),2)
valid_X3["Predict"]<-round(exp(data["predict"]),2)
outliers<-valid_X3[which(data["outlier"]==TRUE),]
outliers["SE"]<-round((outliers["Predict"] - outliers["Origin"])/outliers["Origin"],3)
#outliers["ID"]<- row.names(outliers)
outliers<-outliers[order(outliers["SE"]),]
# 放入 session 中以便后面选择行时引用,不能用行索引,它不是全局的, 行名才是全局的。
session$userData$outliers<- outliers
},
#rownames = FALSE,
options = list(
pageLength = 10,
#language = list(url = '//cdn.datatables.net/plug-ins/1.10.11/i18n/Chinese.json')
language = list(url = 'Chinese.json')
)
)
# 返回并显示选中行的行名,行名是全局的,用于定位对应的行,更新浏览器端显示。
observe({
outliers<- session$userData$outliers
rows<- outliers[input$outliers_rows_selected,]
outliersStr<- paste(row.names(rows), collapse=",")
updateTextAreaInput(session, "rows_selected", value = outliersStr)
})
# 下载分析报告,output$id与downloadButton(id)对应。
output$report <- downloadHandler(
filename = "report.pdf",
content = function(file) {
# 训练集拟合数据
colName<- algos[algo()]
data<- data.frame(origin = train["origin"], predict = train[colName])
names(data)<-c("origin","predict")
data<-data[order(data["origin"]),]
data["index"]<-1:nrow(data)
data["upper"]<-data["origin"]+log(1+threshold()/100)
data["lower"]<-data["origin"]+log(1-threshold()/100)
trainSelected<-data
# 验证集拟合数据
colName<- algos[algo()]
data<- data.frame(origin = valid["origin"], predict = valid[colName])
names(data)<-c("origin","predict")
data<-data[order(data["origin"]),]
data["index"]<-1:nrow(data)
data["upper"]<-data["origin"]+log(1+threshold()/100)
data["lower"]<-data["origin"]+log(1-threshold()/100)
validSelected<-data
# 异常值数据
colName<- algos[algo()]
data<- data.frame(origin = valid["origin"], predict = valid[colName])
names(data)<-c("origin","predict")
data["outlier"]<- FALSE
data[which(data["predict"] < data["origin"]+log(1-threshold()/100)),"outlier"] <-TRUE
data[which(data["predict"] > data["origin"]+log(1+threshold()/100)),"outlier"] <-TRUE
valid_X3<- valid_X2[,c("Lattitude","Longtitude","Distance","BuildingArea","Landsize","YearBuilt","Year")]
valid_X3["Type"]<-valid_X2["Type_h"]*4+valid_X2["Type_t"]*2+valid_X2["Type_u"]
valid_X3[which(valid_X3["Type"]==4),"Type"]<-"H"
valid_X3[which(valid_X3["Type"]==2),"Type"]<-"T"
valid_X3[which(valid_X3["Type"]==1),"Type"]<-"U"
#valid_X3["Origin"]<-data["origin"]
#valid_X3["Predict"]<-data["predict"]
# 转换为原量纲
valid_X3["Origin"]<-round(exp(data["origin"]),2)
valid_X3["Predict"]<-round(exp(data["predict"]),2)
outliers<-valid_X3[which(data["outlier"]==TRUE),]
outliers["SE"]<-round((outliers["Predict"] - outliers["Origin"])/outliers["Origin"],3)
#outliers["ID"]<- row.names(outliers)
outliers<-outliers[order(outliers["SE"]),]
# 生成参数列表,传入报告所需的参数。
params <- list(
algo = input$algo,
threshold = input$threshold,
rows = input$rows_selected,
perf = perf,
train = trainSelected,
valid = validSelected,
outliers = outliers
)
id <- showNotification(
"正在生成报告...",
duration = NULL,
closeButton = FALSE
)
on.exit(removeNotification(id), add = TRUE)
rmarkdown::render("report.Rmd",
output_file = file,
params = params,
envir = new.env(parent = globalenv())
)
}
)
})
5、report.Rmd。
没有修改。拷贝运行时注意要把R代码块标记符中插入的"/"删去,这里要用markdown语法来展示markdown脚本,插入该"/"号以避免代码块标记混乱。注意R代码块外的"#"在Rmarkdown语法中是标题而不是注释。
---
title: "墨尔本房价分析报告"
author: "Jean"
date: "`r Sys.Date()`"
header-includes:
- \usepackage{ctex}
output:
pdf_document:
latex_engine: xelatex
params:
algo: NA
threshold: NA
rows: NA
perf: NA
train: NA
valid: NA
outliers: NA
---
# 各回归算法性能数据:
`/``{r}
perf<-params$perf # 传入参数被锁定不能改变,赋值其它变量。
perf[,-1]<- round(perf[,-1],3) # 除第一列算法名称外,舍入至小数点后3位。
names(perf)<-c("算法","训练集","验证集","耗时(秒)")
knitr::kable(perf)
`/``
# 算法:`r params$algo` ,异常值阀值:`r params$threshold`%。
## 训练集拟合效果
`/``{r fig.showtext = TRUE}
library(ggplot2)
library(showtext)
# Rmarkdown安装配置、输出PDF正文中文以及图片中文配置
# https://blog.csdn.net/weixin_46128755/article/details/125825935
# 设置showtext_auto(),打开ggplot2中文支持。
showtext_auto()
data<-params$train
ggplot(data = data)+
geom_point(mapping = aes(x = index, y = predict, color = "blue"), size=0.1)+
geom_line(mapping = aes(x = index, y = origin, color = "red"))+
geom_ribbon(aes(index, ymax=upper, ymin=lower, fill="yellow"), alpha=0.25)+
ggtitle("训练集拟合效果图")+
xlab("样本")+
ylab("ln(房价)")+
scale_fill_identity(name = '异常值区间', guide = 'legend',labels = c('正常值')) +
scale_colour_manual(name = '颜色',
values =c('blue'='blue','red'='red'), labels = c('预测值','实际值'))
`/``
## 验证集拟合效果
`/``{r fig.showtext = TRUE}
data<-params$valid
ggplot(data = data)+
geom_point(mapping = aes(x = index, y = predict, color = "blue"), size=0.1)+
geom_line(mapping = aes(x = index, y = origin, color = "red"))+
geom_ribbon(aes(index, ymax=upper, ymin=lower, fill="yellow"), alpha=0.25)+
ggtitle("验证集拟合效果图")+
xlab("样本")+
ylab("ln(房价)")+
scale_fill_identity(name = '异常值区间', guide = 'legend',labels = c('正常值')) +
scale_colour_manual(name = '颜色',
values =c('blue'='blue','red'='red'), labels = c('预测值','实际值'))
`/``
## 异常值列表
`/``{r}
outliers<-head(params$outliers,10) # 演示性质,只打印前10行。
names(outliers)<-c("纬度","经度","距离","建筑面积","占地面积","建成","交易","类型",
"价格","预测","偏差")
knitr::kable(outliers)
`/``
## 选中的异常值行号:`r params$rows`。
6、tidy.jsp。
把Shiny APP嵌入集成到Tomcat Web APP的网页中。拷贝Python后端的版本稍微修改一下,一是算法列表不同了,二是引用的Shiny APP网址也不同。
<%@ page language="java" contentType="text/html; charset=UTF-8"
pageEncoding="UTF-8"%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>墨尔本房价回归示例</title>
<script type="text/javascript">
//接受消息
window.addEventListener('message', receiveMessage);
function receiveMessage(event) {
//alert(event.data);
var outliers = document.getElementById('outliers');
var message = event.data.trim();
outliers.value = message;
if (message.length >24){
outliers.size = message.length;
}else{
outliers.size = 24;
}
}
//发送消息
function setAlgo(){
try{
var algo = document.getElementById("algo");
//alert(algo.value);
// document.getElementById("shinyappframe") 失败。
var win = document.getElementsByTagName('iframe')[0].contentWindow;
// postMessage(),窗口间通信,不受浏览器同源规则限制,可用于任何内嵌的iframe。
// 第二个参数是接受窗口的源,协议+域名+端口,可用*表示发给所有窗口。
win.postMessage(algo.value,'*');
//alert("Sent!");
} catch (error) {
alert(error);
}
}
</script>
</head>
<body>
<table>
<tr><h2>墨尔本房价回归示例 Tidy Modeling</h2></tr>
<tr>
</tr>
<tr>
<td>
<form action="details.jsp" method="post">
<label>回归算法:</label>
<select name="algo" id="algo">
<option value="SVM">SVM</option>
<option value="RandomForest">RandomForest</option>
<option value="XGB" selected>XGB</option>
</select>
<input type="button" value="识别异常房价" onclick="setAlgo()" /><br>
房价异常行号:
<input id="outliers" name="outliers" type="text" value="" size=24></input><br>
<input type="submit" value="进一步处理" />
</form>
</td>
<td>
使用说明:<br>
1、点击下拉列表选取要使用的回归算法。<br>
2、点击下拉列表旁边的“识别异常房价”按钮传入Shiny APP识别。<br>
3、可以调整异常值阀值以改变异常值的范围。<br>
4、在页面底部的异常房价列表中选择要传出处理的行,可以翻页选中多行(点击)或取消选中某行(重复点击)。<br>
5、点击“传出选中的行”按钮把识别结果传送到上方的房价异常行号输入框。<br>
6、点击“进一步处理”提交到结果页显示或进一步处理,演示性质只是简单按行号匹配原始数据并显示。<br>
</td>
</tr>
</table>
<iframe id="shinyappframe" src="https://jeanye.cn:4443/shiny/users/jean/tidy/" style="border: 1px solid #AAA; width: 100%; height: 1000px"></iframe>
</body>
</html>
7、details.jsp。
增加一个返回tidy版首页的链接。
<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<%@ page import="java.util.Set,java.util.HashSet,java.io.BufferedReader,java.io.FileReader"%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>异常房价详情</title>
</head>
<body>
<h3>异常房价</h3>
<p>识别结果:<br>
回归算法:<%=new String(request.getParameter("algo").getBytes("ISO-8859-1"),"UTF-8") %><br>
房价异常行号:<%=new String(request.getParameter("outliers").getBytes("ISO-8859-1"),"UTF-8") %><br>
<h3>匹配的原始数据</h3>
<table>
<tr>
<%
String outliers[] = (new String(request.getParameter("outliers").getBytes("ISO-8859-1"),"UTF-8")).split(",");
Set<Integer> set = new HashSet<Integer>();
for (int i=0; i< outliers.length; i++){
set.add(Integer.parseInt(outliers[i]));
}
String path = request.getRealPath(".");
System.out.println(path);
BufferedReader reader = new BufferedReader(new FileReader(path+"/Melbourne_housing_pre.csv"));
// skip header
String header[] =reader.readLine().split(",");
for (int i=0; i<header.length;i++){
%>
<td><%=header[i]%></td>
<%}%></tr>
<%
String line = null;
int index=0;
//读取每行,直到为空
while((line=reader.readLine())!=null){
if (set.contains(index)){
String items[] = line.split(",");
%><tr><%
for (int i=0; i< items.length; i++){
%><td><%=items[i] %></td>
<%
}%></tr><%
}
index++;
}
%>
</table>
<br>
<a href="index.jsp">返回首页(Python模型)</a> <a href="tidy.jsp">返回首页(Tidy模型)</a>
</body>
</html>
现改现用,一个下午搞定,体验地址。