2022-02-28 hive数据库表操作

-- 二、##################数据库表操作##################

-- 1、hive的数据类型
/*
   int      整形
   double   浮点型
   decimal  任意精度的带符号小数
   string   字符串 (MySQL是varchar)
   date     年月日（2022-01-26）
   time     时分秒 (12:34:56)
   datetime 年月日  时分秒  2022-01-26 12:34:56）

   复杂类型：
     Array   数组
     Map     集合
     Struct  对象
 */

-- 2、创建表
/*
    1:在hive中创建表，会在使用的数据库hdfs目录下创建表目录
    2:在Hive中，表可以分为两大类
      内部表(管理表):
        1)创建表时不添加 external关键字
        2)是私有表，删除表时，HDFS上的表数据和MySQL元数据全部删除

      外部表：
        1)创建表时添加 external关键字
        2)是公有表，删除表时，只会删除MySQL元数据，而HDFS上的表数据不会删除

 */
----------------------------内部表操作----------------------------
-- 2.1 创建内部表
set hive.stats.column.autogather=false;
set hive.exec.mode.local.auto=true; -- 开启本地模式（临时开启）

create database if not exists myhive;
use myhive;

create table stu
(
    id   int,
    name string
);

-- 实际开发中不用insert into
-- 只是用于测试，每执行一次就会调用MR，并在HDFS上生成小文件
insert into stu
values (1, "赢驷");
insert into stu
values (1, "赢荡");

select *
from stu;

-- 2.2 验证内部表的特性
-- 删除内部表，其实本质是将HDFS上的文件和元数据删除
-- 如果开启了垃圾桶，则HDFS上的文件暂时被存放到垃圾桶中
drop table stu;

-- 2.3 创建表并指定分隔符
/*
   1:Hive默认识别的分隔符是 '\001' ,这个分隔符是不可显示的，但是Hive可以识别
 */
create table if not exists stu2
(
    id   int,
    name string
) row format delimited fields terminated by ',';

insert into stu2
values (1, '李世民');

select *
from stu2;

-- 2.4 根据另外一张表来创建新表
-- 1：stu3复制了stu2的表结构
-- 2：stu3复制了stu2的表数据
create table stu3 as
select *
from stu2;

select *
from stu3;

-- 2.5 复制另外一张表的表结构
create table stu4 like stu2;

select *
from stu4;

-- 2.6 查看一张表结构
-- 操作1：查看基本字段
desc stu2;
-- 操作2：查看详细信息
/*
   Table Type:
        MANAGED_TABLE:表示该表是管理表，也就是内部表
 */
desc formatted stu2;

-- 2.7 给内部表添加数据
-- 方式1：使用hadoop fs -put 将数据文件上传到表目录文件夹
create table stu5
(
    id   int,
    name string
) row format delimited fields terminated by ',';

select *
from stu5;

-- 方式2：load命令来加载
/*
   load data [local] inpath '/export/data/datas/student.txt'
   [overwrite] | into table student [partition (partcol1=val1,…)];
 */
-- 操作1-本地加载
-- 本地加载本质上是将本地的文件上传到hdfs对应的表目录，做的是复制
create table stu6
(
    id   int,
    name string
) row format delimited fields terminated by ',';

-- 本地有local
load data local inpath '/export/data/stu.txt' into table stu6;

select *
from stu6;

-- 操作2-hdfs加载
-- hdfs加载，本质是将hdfs的文件从源目录剪切到表目录
-- hdfs无local
load data inpath '/hive/data/stu.txt' into table stu6;
select *
from stu6;

----------------------------外部表操作----------------------------
-- 2.1 创建外部表
-- 1）创建教师表
create external table teacher
(
    tid   string,
    tname string
) row format delimited fields terminated by '\t';

-- 2)创建学生表
create external table student
(
    sid    string,
    sname  string,
    sbirth string,
    ssex   string
) row format delimited fields terminated by '\t';

-- 2.2 给student表添加数据-本地加载
-- 追加添加（无overwrite）
load data local inpath '/export/data/student.txt' into table student;
select *
from student;

-- 覆盖添加（有overwrite）
load data local inpath '/export/data/student.txt' overwrite into table student;
select *
from student;

-- 2.3 给teacher表添加数据-hdfs加载
/*
    准备工作:
     hadoop fs -put teacher.txt /hive/data/
     将teacher.txt 上传到 /hive/data/目录
*/
-- 追加添加（无overwrite） 【剪切】
load data inpath '/hive/data/teacher.txt' into table teacher;
select *
from teacher;

-- 覆盖添加（有overwrite）
load data inpath '/hive/data/teacher.txt' overwrite into table teacher;
select *
from teacher;

-- 2.4 删除外部表
-- 只删除元数据，不删除表数据
drop table teacher;
-- 表已经不存在了
select *
from teacher;

-- 重新创建表（修改映射关系）
create external table teacher
(
    tid   string,
    tname string
) row format delimited fields terminated by '\t';
-- 重新查询到数据
select *
from teacher;

-- 2.5 多张外部表共享一份表数据
/*
   1:现有一份数据已经存储在hdfs上，而且不想移动位置
   2:创建表和hdfs上的数据之间产生映射关系
 */

-- 创建covid1表
create external table covid1
(
    date_value date,
    county     string,
    state      string,
    fips       string,
    cases      int,
    deaths     int
)
    row format delimited fields terminated by ','
    location '/hive/data/covid'; -- hdfs目录

select *
from covid1
limit 10;

-- TODO idea报错，linux hive终端执行成功（可能是idea问题），待解决
select count(*)
from covid1;

-- 创建covid2表 和covid共享同一份文件
create external table covid2
(
    date_value date,
    county     string,
    state      string,
    fips       string,
    cases      int,
    deaths     int
)
    row format delimited fields terminated by ','
    location '/hive/data/covid';

select *
from covid2
limit 20;

select count(*)
from covid2;

-- 删除表
drop table covid1;
-- 删除covid1不会影响covid2的查询
select *
from covid2
limit 20;


----------------------------复杂数据类型----------------------------
-- 1、Array类型
/*
zhangsan    beijing,shanghai,tianjin,hangzhou
wangwu  changchun,chengdu,wuhan,beijing
 */
-- 创建表
create external table hive_array
(
    name           string,
    work_locations array<string>
)
    row format delimited fields terminated by '\t'
        collection items terminated by ',';

load data local inpath '/export/data/array.txt' into table hive_array;

select *
from hive_array;

-- 查询work_locations数组中第一个元素
select name, work_locations[0]
from hive_array;

-- 查询location数组中元素的个数
select name, size(work_locations)
from hive_array;

-- 查询location数组中包含tianjin的信息
select *
from hive_array
where array_contains(work_locations, 'tianjin');


-- 2、Map类型
/*
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26
 */
create external table hive_map
(
    id      int,
    name    string,
    members map<string,string>,
    age     int
)
    row format delimited fields terminated by ','
        collection items terminated by '#'
        map keys terminated by ':';

load data local inpath '/export/data/map.txt' into table hive_map;

select *
from hive_map;

-- 根据键找对应的值
select id,
       name,
       members['father'] as dad, -- as dad:起别名(as可省略)
       members['mother'] as mom
from hive_map;

-- 获取所有的键
select id,
       name,
       map_keys(members) as relation
from hive_map;

-- 获取所有的值
select id,
       name,
       map_values(members) as relation
from hive_map;

-- 获取键值对个数
select id,
       name,
       size(members) size
from hive_map;

-- 获取有指定key的数据
select *
from hive_map
where array_contains(map_keys(members), 'brother');

-- 查找包含brother这个键的数据，并获取brother键对应的值
select id,
       name,
       members['brother'] brother
from hive_map
where array_contains(map_keys(members), 'brother');

-- struct架构
/*
192.168.1.1#zhangsan:40:male
192.168.1.2#lisi:50:female
192.168.1.3#wangwu:60:male
192.168.1.4#zhaoliu:70:female
 */
create external table hive_struct
(
    ip     string,
    person struct<name:string,age:int,gender:string>
)
    row format delimited fields terminated by '#'
        collection items terminated by ":";

load data local inpath '/export/data/struct.txt' into table hive_struct;

select *
from hive_struct;

-- 根据struct来获取指定的成员的值
select ip,
       person.name
from hive_struct;

select ip,
       person.name,
       person.age,
       person.gender
from hive_struct;

select ip,
       person.name,
       person.age,
       person.gender
from hive_struct
where person.name = 'wangwu';

----------------------分区表操作----------------------------
/*
    采集数据:
      采集周期: 1天
      存放目录: /logs
           2022_01_01.dat
           2022_01_02.dat
           2022_01_03.dat
           ...
           2022_02_01.dat
           2022_02_02.dat
           2022_02_03.dat
           ...

           2023_01_01.dat
           2023_01_02.dat
           2023_01_03.dat
           ...
           2023_02_01.dat
           2023_02_02.dat
           2023_02_03.dat
           ...

     存放目录:/logs
      year=2022
         month=01
           2022_01_01.dat
           2022_01_02.dat
           2022_01_03.dat
         month=02
           2022_02_01.dat
           2022_02_02.dat
           2022_02_03.dat
           ...
      year=2023
         month=01
           2023_01_01.dat
           2023_01_02.dat
           2023_01_03.dat
         month=02
           2023_02_01.dat
           2023_02_02.dat
           2023_02_03.dat

    1)分区表就是将表数据文件进行分类管理
    2)分区表表现形式就是分文件夹
    3)这里的分区和MapReduce没有关系
    4)分区表可以极大的调高数据查询效率（分区字段可以加在where条件中）
 */

-- 1、创建分区表

/*
    静态分区：
       所有的分区的值需要手动指定
    动态分区：
      所有的分区的值自动生成
 */

----------------单级分区表(一级文件夹)---------------
create external table score
(
    sid    string, -- 学号
    cid    string, -- 学科id
    sscore int     -- 成绩
)
    -- 指定一个分区字段，理论上这个分区子字段可以随便写，分区字段和表字段没有关系
    partitioned by (month string)
    row format delimited fields terminated by '\t';

/*
    month=01
      score.txt
    month=02
      score.txt
 */

-- 2、给分区表添加数据
-- /user/hive/warehouse/myhive.db/score/month=202006
load data local inpath '/export/data/score.txt' into table score partition (month = '202006');

-- /user/hive/warehouse/myhive.db/score/month=202007
load data local inpath '/export/data/score.txt' into table score partition (month = '202007');

-- /user/hive/warehouse/myhive.db/score/month=202008
load data local inpath '/export/data/score.txt' into table score partition (month = '202008');

select *
from score;

select count(*)
from score;

-- 3.1 条件查询 - 只查询6月份的月考成绩
select *
from score
where month = '202006';

-- 3.1 条件查询 - 只查询6月份或7月份成绩
select *
from score
where month = '202006'
   or month = '202007';

select *
from score
where month in ('202006', '202007');


----------------多级分区表(多级文件夹)---------------
create external table score2
(
    sid    string,
    cid    string,
    sscore int
)
    partitioned by (year string,month string,day string)
    row format delimited fields terminated by '\t';

-- 2022
load data local inpath '/export/data/score.txt'
    into table score2 partition (year = '2022',month = '01',day = '01');
load data local inpath '/export/data/score.txt'
    into table score2 partition (year = '2022',month = '01',day = '02');

load data local inpath '/export/data/score.txt'
    into table score2 partition (year = '2022',month = '02',day = '01');
load data local inpath '/export/data/score.txt'
    into table score2 partition (year = '2022',month = '02',day = '02');

-- 2023
load data local inpath '/export/data/score.txt'
    into table score2 partition (year = '2023',month = '01',day = '01');
load data local inpath '/export/data/score.txt'
    into table score2 partition (year = '2023',month = '01',day = '02');

load data local inpath '/export/data/score.txt'
    into table score2 partition (year = '2023',month = '02',day = '01');
load data local inpath '/export/data/score.txt'
    into table score2 partition (year = '2023',month = '02',day = '02');

select *
from score2;

-- 查询指定时间的成绩(2022年成绩)
select *
from score2
where year = '2022';

-- 查询指定时间的成绩(2023年02月成绩)
select *
from score2
where year = '2023'
  and month = '02';

-- 查询指定时间的成绩(2023年02月02日成绩)
select *
from score2
where year = '2023'
  and month = '02'
  and day = '02';

--- union all,将两个表的结果上下拼接在一起（和join不同，join是左右拼接）
explain -- 查看表执行流程
select *
from score
where month = '202006'
union all
select *
from score
where month = '202007';

select *
from score
where month = '202006'
   or month = '202007';

-- 查看分区信息
show partitions score;
show partitions score2;

-- 查看表结构，包含分区信息
desc score;
desc score2;
desc formatted covid2;

-- 添加分区
alter table score
    add partition (month = '202009');

alter table score
    add partition (month = '202010')
        partition (month = '202011');

alter table score2
    add partition (year = '2024',month = '01',day = '01')
        partition (year = '2024',month = '01',day = '02');

-- 删除分区(TODO hdfs上对应文件夹没有删除)
alter table score
    drop partition (month = '202010');


---------------------动态分区----------------------------
-- 1、开启动态分区
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

-- 中间表
create external table test1
(
    id       int,
    date_val string,
    name     string,
    score    int
)
    row format delimited fields terminated by ',';

load data local inpath '/export/data/hivedata/partition_test1.txt' into table test1;

select *
from test1;

-- 3、创建最终分区表
create table test2
(
    id    int,
    name  string,
    score int
)
    partitioned by (xxx string)
    row format delimited fields terminated by ',';

-- 4、查询普通表数据查询并插入分区表，在插入的过程中运行MapReduce，进行动态分区
insert overwrite table test2 partition (xxx)
select id,
       name,
       score,
       date_val -- 动态分区就是根据select的最后一个字段来进行分区的
-- 中间普通表
from test1;



-------------------二级动态分区------------------------
-- 1、创建普通表
create table test3
(
    id       int,
    date_val string,
    name     string,
    sex      string,
    score    int
)
    row format delimited fields terminated by ',';

load data local inpath '/export/data/hivedata/partition_test2.txt' into table test3;

-- 2、创建分区表
create table test4
(
    id    int,
    name  string,
    score int
)
    partitioned by (xxx string, yyy string)
    row format delimited fields terminated by ',';

-- 3、将普通的表数据进行查询，插入到目标分区表，插入时会自动执行MapReduce，完成动态分区
insert overwrite table test4 partition (xxx, yyy)
select id,
       name,
       score,
       date_val,
       sex -- 这里的分区本质是看select的最后两个字段
from test3;

select *
from test4
where xxx = '202106'
  and yyy = 'man';

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 213,014评论 6赞 492
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 90,796评论 3赞 386
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 158,484评论 0赞 348
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 56,830评论 1赞 285
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 65,946评论 6赞 386
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 50,114评论 1赞 292
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 39,182评论 3赞 412
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 37,927评论 0赞 268
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 44,369评论 1赞 303
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 36,678评论 2赞 327
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 38,832评论 1赞 341
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 34,533评论 4赞 335
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 40,166评论 3赞 317
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 30,885评论 0赞 21
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 32,128评论 1赞 267
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 46,659评论 2赞 362
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 43,738评论 2赞 351

2022-02-28 hive数据库表操作

推荐阅读更多精彩内容