Global.java (DataManagement / Spark-Autotuning, committed by wengchangsu on 2019-09-30, v0.1)
package others;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import shellInterface.UserSubmitInterface_test;
/**
*@author tonychao
*Holds default values for global system settings; they can be overridden by the
*configuration files under the /conf directory.
*When changing the runtime environment, note that the DB, the history server,
*and the DB used by the Python files must all be updated.
*
*/
public class Global {
//To pass the system acceptance test, every call currently answers yes; the normal defaults are both false
public static boolean ALWAYS_TUNE= true; //do not ask the user; always tune
public static boolean ALWAYS_NOT_TUNE= false; //do not ask the user; never tune
public static SparkTuneOption DEFAULT_OPTION= SparkTuneOption.NOT_SET; //by default, chosen automatically from historical data
public static Boolean RECORD_LOG_INTO_HISTORY = true; //by default, tuned application parameters are recorded directly
//Parameters used by the hybrid optimization strategy
public static double HYBIRD_K=3.16;
public static double HYBIRD_p=0.9;
//Global system flags
public static boolean SUBMIT_COMMAND_TO_SYSYTEM= true;//whether to actually submit the command
public static boolean DEBUG_FLAG= true; //controls output verbosity
public static final int HDFS_BLOCK_SIZE_IN_MB=128; //HDFS block size, 128 MB by default
//Model training settings
public static int RETRAIN_COUNT=1;//retrain after this many new records have arrived
public static int WAIT_TIME_IN_MS=100000;//how long to wait before asking the history server for records; usually longer than one minute, depending on the version
public static final int MIN_FACTOR=1;//to keep jobs relatively safe, only record counts that are an integer multiple of this factor trigger training; this parameter has no effect when data files are used
//Paths of dependent configuration files; currently these files define the tuning parameter ranges
public static final String CONF_PATH="conf/";
public static final String HIT="searchDistrict.json"; //parameter-range table for the large Science Park cluster
public static final String THU="THUSearchDistrict.json";//parameter-range table for the Tsinghua cluster
public static final String HIT_TINY="HIT_tiny_cluster_SearchDistrict.json"; //parameter-range table for the small cluster
public static String PARAMETERT_DISTRICT_CONF_FILE=HIT_TINY;
//Spark history server settings
public static final String THU_SPARK_HISTORY_SERVER="http://192.168.130.18:18080/";
public static final String LOCAL_UBUNTU_SPARK_HISTORY_SERVER="http://127.0.0.1:18080/";
public static final String TINY_CLUSTER_SPARK_HISTORY_SERVER="http://192.168.1.50:18080/";
public static final String XGD_CLUSTER_SPARK_HISTORY_SERVER="http://172.19.0.134:18080/";
public static String CURRENT_SPARK_HISTORY_SERVER=XGD_CLUSTER_SPARK_HISTORY_SERVER;
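/* Illustrative note, an assumption rather than part of this file: a stock Spark
 * history server exposes completed applications over its REST API at
 * <server>/api/v1/applications, e.g.
 *   curl http://127.0.0.1:18080/api/v1/applications
 * Where this project actually queries CURRENT_SPARK_HISTORY_SERVER is defined
 * elsewhere in the codebase, not here. */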
//MySQL database service; dropped in the project's final release because of open sourcing
public static boolean USE_DB_Flag =true;
public static final String JDBC_LOCAL_UBUNTU_DB="jdbc:mysql://127.0.0.1:3306/sparkTuningDB";
public static final String JDBC_THU_CLUSTER_DB="jdbc:mysql://:3306/sparkTuningDB";
public static final String TINY_CLUSTER_DB="jdbc:mysql://192.168.1.50:3306/sparkTuningDB";
public static final String XGD_CLUSTER_DB="jdbc:mysql://10.69.35.174:32100/sparkTuningDB";//?connectTimeout=3000&socketTimeout=60000
public static String CURRENT_DB=XGD_CLUSTER_DB;
public static String USER_NAME="tonychao";
public static String PASS_WORD="sql";
//File dependencies
public static final String PYTHON_MAKE_MODEL_SCRIPT_DB="."+File.separator+"PythonFiles/trainModelFromDB.py";
public static final String PYTHON_MAKE_MODEL_SCRIPT_DATAFILE="."+File.separator+"PythonFiles/trainModelFromData2.py";
public static final String PYTHON_DAEMON_PATH_FROM_DB="."+File.separator+"PythonFiles/daemon_fromDB.py";
public static final String PYTHON_DAEMON_PATH_FROM_FILE="."+File.separator+"PythonFiles/daemon_from_File2.py";
public static final String INPUT_COMMAND_PATH = "."+File.separator+"testfile/run.sh";
public static final String DUMMY_RECORD_COMMAND_PATH = "."+File.separator+"generateDummyRecord.sh";
public static final String INIT_COMMAND_PATH = "."+File.separator+"command.sh";
public static final String DATA_FILE_ROOT_PATH="."+File.separator+"historyDataWhenDBnotAvaliable"+File.separator;
public static final String MODEL_PATH="."+File.separator+"savedModels"+File.separator;
//Heuristic RRS search parameters
public static int RANDOM_SEARCH_COUNT=7000;//statistically, only about one in 43 random probes finds a configuration better than the baseline, so the exploit time is roughly 1/43 of the random search time
public static long MAX_SEARCH_TIME_MS = 15000;
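/* Sketch of how a count- and time-bounded random search could consume these two
 * knobs (an illustrative assumption; the project's real RRS loop lives elsewhere):
 *   long deadline = System.currentTimeMillis() + Global.MAX_SEARCH_TIME_MS;
 *   for (int i = 0; i < Global.RANDOM_SEARCH_COUNT
 *           && System.currentTimeMillis() < deadline; i++) {
 *       // sample a random configuration, score it with the model, keep the best
 *   }
 */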
//Parameters searched in each group, converted to int; note: their order is given by the ordering in GlobalStatics and must be strictly followed
public static final String[] CHOSEN_PARAMETERS_GROUP0=new String[]{ //heuristically determined parameters
"spark.task.cpus",
"spark.executor.cores",
"spark.executor.memory",
"spark.driver.cores",
"spark.driver.memory",
// "spark.default.parallelism" //因为没有确定的默认值,所以不打印
};
public static final String[] CHOSEN_PARAMETERS_GROUP1=new String[]{ //non-heuristic parameters
"spark.reducer.maxSizeInFlight", //shuffle read&write
"spark.shuffle.compress",
"spark.shuffle.spill.compress",
"spark.shuffle.file.buffer",
"spark.broadcast.compress",// broadcast
"spark.broadcast.blockSize",
"spark.rpc.message.maxSize",
"spark.memory.fraction", //memory
"spark.memory.storageFraction",
"spark.rdd.compress", //compress
"spark.io.compression.codec"
};
public static final String[] CHOSEN_PARAMETERS_GROUP2=new String[]{ //non-heuristic parameters (placeholder)
"dummy"
};
public static final String[] CHOSEN_PARAMETERS_GROUP3=new String[]{ //non-heuristic parameters (placeholder)
"dummy"
};
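/* Illustrative only, not taken from this project: parameters from the groups above
 * would typically reach Spark as --conf flags on spark-submit, e.g.
 *   spark-submit --conf spark.executor.cores=4 \
 *                --conf spark.executor.memory=4g \
 *                --conf spark.memory.fraction=0.6 \
 *                app.jar
 * The values and the jar name here are made up; the command this project actually
 * submits is assembled elsewhere in the codebase. */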
//Cluster configuration
public static final ArrayList<Integer> CORE_TINY_CLUSTER = new ArrayList<Integer>(Arrays.asList(4,4,4)); //cores per node
public static final ArrayList<Integer> MEMORY_TINY_CLUSTER = new ArrayList<Integer>(Arrays.asList(4096,4096,4096)); //memory per node, in MB
public static final ArrayList<Integer> CORE_TINY_CLUSTER2 = new ArrayList<Integer>(Arrays.asList(8,32)); //cores per node
public static final ArrayList<Integer> MEMORY_TINY_CLUSTER2 = new ArrayList<Integer>(Arrays.asList(8192/2,8192/2)); //memory per node, in MB
public static final ArrayList<Integer> CORES = CORE_TINY_CLUSTER2;
public static final ArrayList<Integer> MEMORYS_IN_MB =MEMORY_TINY_CLUSTER2;
public static final int CORE_MASTER = 8;
public static final int MEMORY_MASRTER = 4096*3;
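/* With the active CORE_TINY_CLUSTER2 / MEMORY_TINY_CLUSTER2 selection, the lists
 * above describe a two-node cluster with 8 + 32 = 40 worker cores and
 * 8192/2 = 4096 MB of memory per node, plus an 8-core master with
 * 4096*3 = 12288 MB of memory. */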
//Returns a summary of the basic configuration
public static String info(){
String str="############################\n"+
"DEBUG_FLAG="+DEBUG_FLAG+"\n"+
"PARAMETERT_DISTRICT_CONF="+PARAMETERT_DISTRICT_CONF_FILE +"\n"+
"CURRENT_SPARK_HISTORY_SERVER="+CURRENT_SPARK_HISTORY_SERVER+"\n"+
"CURRENT_DB="+CURRENT_DB+"\n"+
"############################\n";
return str;
}
public Global() throws JSONException, IOException{
loadFromConfFile("."+File.separator+"conf"+File.separator+"netWorkConf.json");
}
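/* Minimal usage sketch (an assumption, not code from this project):
 *   Global g = new Global();            // applies overrides from ./conf/netWorkConf.json
 *   System.out.println(Global.info());  // print the active configuration summary
 */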
//Spark deployment mode; currently standalone and on-YARN are supported
public enum SparkMode{
SPARK_STAND_ALONE,SPARK_ON_YARN
}
public static SparkMode DEFAULT_MODE=SparkMode.SPARK_ON_YARN;//default deployment mode
//Spark tuning modes
public enum SparkTuneOption{ //note: the heuristic model has no effect in Spark standalone mode
HURISTIC_ONLY, //tune parameters heuristically only
GRAYBOX_ONLY, //use only the gray-box model
HURISTIC_AND_GRAYBOX, //heuristic parameters, with the gray-box method tuning the remaining ones
NOT_SET, //the tuning mode is chosen from the actual situation as HURISTIC_ONLY, HURISTIC_AND_GRAYBOX, or GRAYBOX_ONLY
HYBIRD //an adaptive selection strategy: when a model is available it picks between GRAYBOX_ONLY and HURISTIC_AND_GRAYBOX; when no model is available it uses HURISTIC_ONLY
}
//Reloads selected settings from the given configuration file
private static void loadFromConfFile(String filePath) throws JSONException, IOException{
//read the file contents, skipping comment lines
StringBuilder str= new StringBuilder();
try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) {
String line;
while ((line=reader.readLine())!=null){
if (line.trim().startsWith("//")) continue; //comment line
str.append(line);
}
} catch (FileNotFoundException e) {
System.err.println("file not found");
e.printStackTrace();
return;
}
JSONArray dataJson= new JSONArray(str.toString());
for (int i=0;i<dataJson.length();i++){ //check whether this configuration file should take effect
if(dataJson.getJSONObject(i).getString("name").equals("use_this_conf_file")){
boolean isValid=dataJson.getJSONObject(i).getBoolean("VALUE");
if (isValid) {
UserSubmitInterface_test.UIOutPut("Loading settings from the configuration file");
break;
}
else {
UserSubmitInterface_test.UIOutPut("Not loading settings from the configuration file");
return; //if disabled, skip processing the entire file
}
}
}
//parse the entries one by one
for(int i =0;i<dataJson.length();i++){
JSONObject tmp= dataJson.getJSONObject(i);
String name= tmp.getString("name");
switch (name) {
case "spark_history_server":
String ip=tmp.getString("IP");
String port=tmp.getString("PORT");
Global.CURRENT_SPARK_HISTORY_SERVER=ip+":"+port+"/";
break;
case "database_service":
String url=tmp.getString("URL");
String user=tmp.getString("USER");
String password=tmp.getString("PASSWORD");
Global.CURRENT_DB=url;
Global.USER_NAME=user;
Global.PASS_WORD=password;
break;
case "random_search_count":
int randomSearchCount=tmp.getInt("VALUE");
Global.RANDOM_SEARCH_COUNT=randomSearchCount;
break;
case "random_search_time_ms":
Long randomSearchTimeMS=tmp.getLong("VALUE");
Global.MAX_SEARCH_TIME_MS=randomSearchTimeMS;
break;
case "use_database_or_file":
String op=tmp.getString("VALUE");
if (op.toLowerCase().equals("file")) { //不使用DB进行优化
Global.USE_DB_Flag=false;
}
else { //使用DB进行优化
Global.USE_DB_Flag=true;
}
break;
case "use_this_conf_file": //已经处理完了,直接跳过
break;
case "always_tune": //已经处理完了,直接跳过
boolean ret=dataJson.getJSONObject(i).getBoolean("VALUE");
Global.ALWAYS_TUNE=ret;
break;
case "always_not_tune": //已经处理完了,直接跳过
Global.ALWAYS_NOT_TUNE=dataJson.getJSONObject(i).getBoolean("VALUE");
break;
case "WAIT_TIME_IN_MS":
Global.WAIT_TIME_IN_MS=dataJson.getJSONObject(i).getInt("VALUE");
break;
case "PARAMETERT_DISTRICT_CONF_FILE":
Global.PARAMETERT_DISTRICT_CONF_FILE=dataJson.getJSONObject(i).getString("VALUE");
break;
case "DEBUG_FLAG":
Global.DEBUG_FLAG=dataJson.getJSONObject(i).getBoolean("VALUE");
break;
case "DEFAULT_OPTION":
String str1=dataJson.getJSONObject(i).getString("VALUE");
switch (str1){
case "NOT_SET": //NOT_SET HURISTIC_ONLY HURISTIC_AND_GRAYBOX
Global.DEFAULT_OPTION=SparkTuneOption.NOT_SET;
break;
case "HURISTIC_ONLY":
Global.DEFAULT_OPTION=SparkTuneOption.HURISTIC_ONLY;
break;
case "HURISTIC_AND_GRAYBOX":
Global.DEFAULT_OPTION=SparkTuneOption.HURISTIC_AND_GRAYBOX;
break;
case "GRAYBOX_ONLY":
Global.DEFAULT_OPTION=SparkTuneOption.GRAYBOX_ONLY;
break;
case "HYBIRD":
Global.DEFAULT_OPTION=SparkTuneOption.HYBIRD;
break;
}
break;
case "RECORD_LOG_INTO_HISTORY":
Global.RECORD_LOG_INTO_HISTORY=dataJson.getJSONObject(i).getBoolean("VALUE");
break;
case "RETRAIN_COUNT":
Global.RETRAIN_COUNT=dataJson.getJSONObject(i).getInt("VALUE");
break;
case "HYBIRD_PARAMETERS":
Global.HYBIRD_K=dataJson.getJSONObject(i).getDouble("K");
Global.HYBIRD_p=dataJson.getJSONObject(i).getDouble("p");
break;
case "SUBMIT_COMMAND_TO_SYSYTEM":
Global.SUBMIT_COMMAND_TO_SYSYTEM=dataJson.getJSONObject(i).getBoolean("VALUE");
break;
default:
System.err.println(filePath+" parse error: undefined property:\t"+name);
break;
}
}
return;
}
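/* A sketch of the netWorkConf.json layout this parser expects, reconstructed from
 * the switch above; all values are placeholders, not real deployment settings:
 * [
 *   {"name": "use_this_conf_file", "VALUE": true},
 *   {"name": "spark_history_server", "IP": "http://127.0.0.1", "PORT": "18080"},
 *   {"name": "database_service", "URL": "jdbc:mysql://127.0.0.1:3306/sparkTuningDB",
 *    "USER": "tonychao", "PASSWORD": "sql"},
 *   {"name": "random_search_count", "VALUE": 7000},
 *   {"name": "use_database_or_file", "VALUE": "file"},
 *   {"name": "DEFAULT_OPTION", "VALUE": "HYBIRD"}
 * ]
 * Lines starting with // are skipped by the reader above, so comments are allowed. */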
}