JDBC数据源实战
Spark SQL支持使用JDBC从关系型数据库(比如MySQL)中读取数据。读取的数据,依然由DataFrame表示,可以很方便地使用Spark Core提供的各种算子进行处理。
实际上,用Spark SQL处理JDBC中的数据是非常有用的。比如说,你的MySQL业务数据库中有大量的数据,比如1000万条,而你现在需要编写一个程序,对线上的脏数据进行某种复杂业务逻辑的处理,甚至复杂到可能需要用Spark SQL反复查询Hive中的数据,来进行关联处理。
那么此时,用Spark SQL来通过JDBC数据源,加载MySQL中的数据,然后通过各种算子进行处理,是最好的选择。因为Spark是分布式的计算框架,对于1000万数据,肯定是分布式处理的。而如果你自己手工编写一个Java程序,那么不好意思,你只能分批次处理了,先处理2万条,再处理2万条,可能运行完你的Java程序,已经是几天以后的事情了。
数据准备
-- Create the demo database and make it the current schema.
CREATE DATABASE mytest;
USE mytest;

-- Source tables: one row per student, keyed by name.
CREATE TABLE student_infos (
  name VARCHAR(20),
  age  INTEGER
);
CREATE TABLE student_scores (
  name  VARCHAR(20),
  score INTEGER
);

-- Sample rows read by the Spark jobs.
INSERT INTO student_infos VALUES ('leo', 18), ('marry', 17), ('jack', 19);
INSERT INTO student_scores VALUES ('leo', 88), ('marry', 99), ('jack', 60);

-- Target table the Spark jobs insert (name, age, score) rows into.
CREATE TABLE good_student_infos (
  name  VARCHAR(20),
  age   INTEGER,
  score INTEGER
);
JDBC数据源实战
案例:查询分数大于80分的学生信息
Java版本
/**
 * Loads the student_infos and student_scores tables from MySQL through the
 * Spark SQL JDBC data source, joins them by student name, keeps the students
 * whose score is greater than 80, prints them, and inserts them into the
 * good_student_infos table.
 */
public class JDBCDataSource {

    // Connection settings shared by the JDBC reads and the JDBC write.
    private static final String JDBC_URL = "jdbc:mysql://hadoop-100:3306/mytest";
    private static final String JDBC_USER = "root";
    private static final String JDBC_PASSWORD = "zhaojun2436";

    // Parameterized insert: values are bound by the driver instead of being
    // concatenated into the SQL text, so a quote in a name cannot break or
    // alter the statement.
    private static final String INSERT_SQL = "insert into good_student_infos values(?, ?, ?)";

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JDBCDataSourceJava").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sparkContext);

            // Load the two MySQL tables as DataFrames.
            DataFrame infoDF = loadTable(sqlContext, "student_infos");
            DataFrame scoreDF = loadTable(sqlContext, "student_scores");

            // Convert both DataFrames to (name, value) pair RDDs and join them by name.
            JavaPairRDD<String, Integer> infoRDD = toNameValuePairs(infoDF);
            JavaPairRDD<String, Integer> scoreRDD = toNameValuePairs(scoreDF);
            JavaPairRDD<String, Tuple2<Integer, Integer>> infoJoinScore = infoRDD.join(scoreRDD);

            // Flatten each joined record into a Row of (name, age, score).
            JavaRDD<Row> infoJoinScoreRDD = infoJoinScore.map(
                    new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
                        @Override
                        public Row call(Tuple2<String, Tuple2<Integer, Integer>> v1) throws Exception {
                            return RowFactory.create(v1._1, v1._2._1, v1._2._2);
                        }
                    });

            // Keep only the rows whose score (column 2) is greater than 80.
            JavaRDD<Row> goodStudent = infoJoinScoreRDD.filter(new Function<Row, Boolean>() {
                @Override
                public Boolean call(Row v1) throws Exception {
                    return v1.getInt(2) > 80;
                }
            });

            // Convert the filtered rows back to a DataFrame and print them.
            List<StructField> fieldList = new ArrayList<>();
            fieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
            fieldList.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
            fieldList.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
            StructType structType = DataTypes.createStructType(fieldList);
            DataFrame df = sqlContext.createDataFrame(goodStudent, structType);
            Row[] collect = df.collect();
            for (Row row : collect) {
                System.out.println(row);
            }

            // Write the filtered rows to MySQL. The same per-partition pattern
            // applies to other sinks (HBase, Redis, ...).
            saveGoodStudents(goodStudent);
        } finally {
            // Release Spark resources even if the job fails part-way through.
            sparkContext.stop();
        }
    }

    /**
     * Reads one MySQL table through the Spark SQL JDBC data source.
     *
     * @param sqlContext the SQLContext used to read
     * @param table      name of the table to load
     * @return the table contents as a DataFrame
     */
    private static DataFrame loadTable(SQLContext sqlContext, String table) {
        // A fresh options map per table avoids mutating a map shared between reads.
        Map<String, String> options = new HashMap<String, String>();
        options.put("url", JDBC_URL);
        options.put("dbtable", table);
        options.put("user", JDBC_USER);
        options.put("password", JDBC_PASSWORD);
        return sqlContext.read().options(options).format("jdbc").load();
    }

    /**
     * Maps a two-column (string, int) DataFrame to a pair RDD keyed by the first column.
     *
     * @param table DataFrame whose column 0 is read as a string and column 1 as an int
     * @return pair RDD of (column 0, column 1)
     */
    private static JavaPairRDD<String, Integer> toNameValuePairs(DataFrame table) {
        return table.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                return new Tuple2<>(row.getString(0), row.getInt(1));
            }
        });
    }

    /**
     * Inserts every (name, age, score) row into good_student_infos, using one
     * connection and one prepared statement per partition rather than per row.
     * Failures are reported on stderr and do not abort the job.
     *
     * @param goodStudent rows of (name, age, score) to insert
     */
    private static void saveGoodStudents(JavaRDD<Row> goodStudent) {
        goodStudent.foreachPartition(new VoidFunction<java.util.Iterator<Row>>() {
            @Override
            public void call(java.util.Iterator<Row> rows) throws Exception {
                Class.forName("com.mysql.jdbc.Driver");
                // try-with-resources closes the statement and then the connection,
                // even when one of the close() calls itself throws.
                try (Connection conn = DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASSWORD);
                     java.sql.PreparedStatement stmt = conn.prepareStatement(INSERT_SQL)) {
                    while (rows.hasNext()) {
                        Row row = rows.next();
                        try {
                            stmt.setString(1, row.getString(0));
                            stmt.setInt(2, row.getInt(1));
                            stmt.setInt(3, row.getInt(2));
                            stmt.executeUpdate();
                        } catch (java.sql.SQLException e) {
                            // Best effort: report the failed row and continue with the next one.
                            e.printStackTrace();
                        }
                    }
                } catch (java.sql.SQLException e) {
                    // Connecting or preparing the statement failed for this partition.
                    e.printStackTrace();
                }
            }
        });
    }
}
Scala版本
/**
 * Loads the student_infos and student_scores tables from MySQL through the
 * Spark SQL JDBC data source, joins them by student name, keeps the students
 * whose score is greater than 80, and inserts them into good_student_infos.
 */
object JDBCDataSource {

  // Connection settings shared by the JDBC reads and the JDBC write.
  private val JdbcUrl = "jdbc:mysql://hadoop-100:3306/mytest"
  private val JdbcUser = "root"
  private val JdbcPassword = "zhaojun2436"

  def main(args: Array[String]): Unit = {
    import scala.util.control.NonFatal

    // Build the SparkConf; the app name matches the Java version's naming.
    val conf = new SparkConf().setAppName("JDBCDataSourceScala").setMaster("local")
    val sparkContext = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sparkContext)

      // Reads one MySQL table as a DataFrame via the JDBC data source.
      def loadTable(table: String) =
        sqlContext.read
          .format("jdbc")
          .option("url", JdbcUrl)
          .option("dbtable", table)
          .option("user", JdbcUser)
          .option("password", JdbcPassword)
          .load()

      val info = loadTable("student_infos")
      val score = loadTable("student_scores")

      // Key both tables by name (column 0) and join them: (name, (age, score)).
      val infoRDD = info.rdd.map(row => (row.getString(0), row.getInt(1)))
      val scoreRDD = score.rdd.map(row => (row.getString(0), row.getInt(1)))
      val infoJoinScore = infoRDD.join(scoreRDD)

      // Keep only the students whose score is greater than 80.
      val goodStudent = infoJoinScore.filter { case (_, (_, studentScore)) => studentScore > 80 }

      // One connection and one prepared statement per partition instead of per row.
      goodStudent.foreachPartition { rows =>
        // Class.forName initializes the driver class; a bare classOf[...]
        // expression only references the class and does not.
        Class.forName("com.mysql.jdbc.Driver")
        val conn = DriverManager.getConnection(JdbcUrl, JdbcUser, JdbcPassword)
        try {
          val insert = conn.prepareStatement("INSERT INTO good_student_infos VALUES (?, ?, ?) ")
          try {
            rows.foreach { case (name, (age, studentScore)) =>
              try {
                insert.setString(1, name)
                insert.setInt(2, age)
                insert.setInt(3, studentScore)
                insert.executeUpdate()
              } catch {
                // Best effort: report the failed row and continue with the next one.
                case NonFatal(e) => e.printStackTrace()
              }
            }
          } finally {
            insert.close()
          }
        } finally {
          conn.close()
        }
      }
    } finally {
      // Release Spark resources even if the job fails part-way through.
      sparkContext.stop()
    }
  }
}