工作一年多了,一直在断断续续地学习使用 Hadoop,它提供的 Map 和 Reduce 数据处理引擎能够帮助我们方便地处理大数据集,HDFS 分布式文件系统可以帮助我们冗余地存储大数据集,这么好的一门技术,应该是要好好学习的。
最近一直在看《MapReduce 设计模式》一书,里面介绍了许多问题的解决办法,总觉得读书应该记些笔记,这里就写点什么吧!
首先,需要准备一个数据集,供后面的数据处理算法使用:
- 这个数据集包含10000行数据,其中的每一行都是 json 字符串
- 每个 json 字符串中包含一个用户的四个基本信息(id, userName, sex, age)
接下来,生成这个数据集:
package hadoop_design.mock_user_info;
/**
 * Basic user information: a mutable bean with one accessor pair per field.
 * Created by zhanghu on 16/8/27.
 */
public class UserBean {

    /** User name. */
    private String userName;

    /** Age of the user. */
    private int age;

    /** Gender flag: 1 means male, 0 means female. */
    private int sex;

    /** Identifier, intended to be md5(userName + age + sex). */
    private String id;

    /** @return the user name, or {@code null} if it has not been set */
    public String getUserName() {
        return this.userName;
    }

    /** @param userName the user name to store */
    public void setUserName(String userName) {
        this.userName = userName;
    }

    /** @return the stored age */
    public int getAge() {
        return this.age;
    }

    /** @param age the age to store */
    public void setAge(int age) {
        this.age = age;
    }

    /** @return the gender flag (1 = male, 0 = female) */
    public int getSex() {
        return this.sex;
    }

    /** @param sex the gender flag to store (1 = male, 0 = female) */
    public void setSex(int sex) {
        this.sex = sex;
    }

    /** @return the identifier, or {@code null} if it has not been set */
    public String getId() {
        return this.id;
    }

    /** @param userId the identifier to store */
    public void setId(String userId) {
        id = userId;
    }
}
package hadoop_design.mock_user_info;
import net.sf.json.JSONObject;
/**
* Json 工具类
* Created by zhanghu on 16/8/27.
*/
public class JsonUtils {
public static String objectToJsonString(Object object) {
JSONObject json = JSONObject.fromObject(object);
return json.toString();
}
}
package hadoop_design.mock_user_info;
import java.util.Random;
/**
 * Static facade over a single shared {@link Random} instance.
 * Reference: http://www.cnblogs.com/dongliyang/archive/2013/04/01/2994554.html
 * Created by zhanghu on 16/8/27.
 */
public final class StdRandom {

    /** Shared generator backing every method of this class. */
    private static Random generator;

    /** Seed that the current generator was created with. */
    private static long currentSeed;

    // Seed from the wall clock when the class is loaded.
    static {
        setSeed(System.currentTimeMillis());
    }

    /** Not instantiable: all members are static. */
    private StdRandom() {
    }

    /**
     * Replaces the shared generator with a new one created from the given seed.
     *
     * @param s seed for the random number generator
     */
    public static void setSeed(long s) {
        currentSeed = s;
        generator = new Random(s);
    }

    /**
     * Returns the seed the current generator was created with.
     *
     * @return the seed value
     */
    public static long getSeed() {
        return currentSeed;
    }

    /**
     * Returns a random real number in [0, 1).
     *
     * @return the random value
     */
    public static double uniform() {
        return generator.nextDouble();
    }

    /**
     * Returns a random integer in [0, N).
     *
     * @param N exclusive upper bound
     * @return the random value
     */
    public static int uniform(int N) {
        return generator.nextInt(N);
    }

    /**
     * Returns a random real number in [0, 1); same draw as {@link #uniform()}.
     *
     * @return the random value
     */
    public static double random() {
        return generator.nextDouble();
    }

    /**
     * Returns a random integer in [a, b).
     *
     * @param a inclusive lower bound
     * @param b exclusive upper bound
     * @return the random value
     */
    public static int uniform(int a, int b) {
        int span = b - a;
        return a + generator.nextInt(span);
    }

    /**
     * Returns a random real number between a and b, computed as
     * {@code a + u * (b - a)} where {@code u} is drawn from [0, 1).
     *
     * @param a lower bound
     * @param b upper bound
     * @return the random value
     */
    public static double uniform(double a, double b) {
        double fraction = generator.nextDouble();
        return a + fraction * (b - a);
    }
}
package hadoop_design.mock_user_info;
/**
 * String helper utilities.
 * Created by zhanghu on 16/8/27.
 */
public class StringUtils {

    /** Candidate characters for the free positions: digits, then A-Z, then a-z. */
    private static final String ALPHABET =
            "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    /**
     * Builds a random string containing at least one digit, one upper-case
     * letter and one lower-case letter.
     *
     * @param len length of the string; must be at least 3
     * @return the random string
     * @throws IllegalArgumentException if {@code len} is less than 3
     */
    public static String randomStr(int len) {
        if (len < 3) {
            throw new IllegalArgumentException("字符串长度不能小于3");
        }

        char[] buffer = new char[len];

        // Seed the first three slots with one character of each required class.
        buffer[0] = (char) ('0' + StdRandom.uniform(0, 10));
        buffer[1] = (char) ('A' + StdRandom.uniform(0, 26));
        buffer[2] = (char) ('a' + StdRandom.uniform(0, 26));

        // Fill the remaining slots with arbitrary characters from the alphabet.
        final int alphabetSize = ALPHABET.length();
        for (int slot = 3; slot < len; slot++) {
            buffer[slot] = ALPHABET.charAt(StdRandom.uniform(0, alphabetSize));
        }

        // Shuffle so the three seeded characters do not stay at fixed positions:
        // each position is swapped with a random position at or after it.
        for (int pos = 0; pos < len; pos++) {
            int pick = pos + StdRandom.uniform(len - pos);
            char swapped = buffer[pick];
            buffer[pick] = buffer[pos];
            buffer[pos] = swapped;
        }

        return String.valueOf(buffer);
    }
}
package hadoop_design.mock_user_info;
import org.apache.commons.codec.digest.DigestUtils;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Main program: writes the mock user data set to a file, one JSON object per line.
 * Created by zhanghu on 16/8/27.
 */
public class Main {

    /** Number of JSON lines written to the output file. */
    private static final int RECORD_COUNT = 10000;

    /** A progress message is printed every this many records. */
    private static final int PROGRESS_INTERVAL = 1000;

    /** Output file name, resolved against the working directory. */
    private static final String OUTPUT_FILE = "user.data";

    /**
     * Creates one random user and serializes it to JSON.
     * The user name has a random length in [10, 21), the age is in [0, 100),
     * the sex flag is in [0, 2), and the id is the MD5 hex digest of
     * userName + age + sex.
     *
     * @return the JSON string for the generated user
     */
    private static String generateData() {
        UserBean userBean = new UserBean();
        userBean.setUserName(StringUtils.randomStr(StdRandom.uniform(10, 21)));
        userBean.setAge(StdRandom.uniform(0, 100));
        userBean.setSex(StdRandom.uniform(0, 2));
        String md5Bean = userBean.getUserName() + userBean.getAge() + userBean.getSex();
        userBean.setId(DigestUtils.md5Hex(md5Bean));
        return JsonUtils.objectToJsonString(userBean);
    }

    /**
     * Generates {@code RECORD_COUNT} users and writes them to {@code OUTPUT_FILE}.
     *
     * @param args unused
     * @throws IOException if the file cannot be opened or written
     */
    public static void main(String[] args) throws IOException {
        File file = new File(OUTPUT_FILE);
        // try-with-resources closes (and thereby flushes) the writer even when
        // a write fails part-way, so the file handle is never leaked.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(file))) {
            for (int i = 0; i < RECORD_COUNT; ++i) {
                if (i % PROGRESS_INTERVAL == 0) {
                    System.out.println("mock data line : " + i);
                }
                out.write(generateData());
                out.newLine();
            }
        }
    }
}
OK, 利用上面的程序,我得到了一个包含 10000 行用户信息的文本文件,其中每一行都是一个 json 字符串,类似于下面这样:
{"age":48,"id":"7a8bd2dc862f8ce972292474f2f3bc56","sex":1,"userName":"dHI3w56HNTiQh"}
{"age":18,"id":"fbcf2df050aa2da3c678dcb0a02bda2d","sex":1,"userName":"Xh7mU53Ba7JZ"}
{"age":70,"id":"4808f32ecbbe21b93882bb44973e7bea","sex":1,"userName":"aLV5E156YdJ"}
{"age":57,"id":"a9863ef325a6ca91f2554e8f4874d424","sex":1,"userName":"CwQ43w548IS"}
{"age":71,"id":"e4e7724632feefc514902d0849a86d6b","sex":1,"userName":"sz9hcdCZnkVXC3x"}
{"age":26,"id":"25ae8b3ab30f11a267939fec7177f829","sex":1,"userName":"cA17IpnzzPFMv4"}
{"age":58,"id":"52a7cb852583fe183d300de9c6de0efa","sex":1,"userName":"9b8v3HFIaNqsIyC2a97"}
{"age":55,"id":"71fd9c057f2c60b99021ce8a353c5cb0","sex":0,"userName":"9OqAJlyZVKgpV"}
{"age":25,"id":"6b758fb1a4e78930ea919074d5abb172","sex":0,"userName":"OvHcn61daoXTu"}
{"age":90,"id":"02883e46b66fd848075401ae205b0896","sex":0,"userName":"4kyHB1s5v6nQ049"}
下面,开始使用这些数据吧!