{
"颜色标签": {
"颜色": ["黑色", "白色", "红色", "蓝色", "绿色", "紫色"],
"尺寸": ["超大", "大"]
},
"颜色标签2": {
"颜色2": ["黑色", "白色", "红色", "蓝色", "绿色", "紫色", "黄色"],
"尺寸2": ["超大", "大"]
}
}
问题
使用SparkSQL将上述的数据根据嵌套提取成3列
解答
with base_tb as (
select '{
"颜色标签": {
"颜色": ["黑色", "白色", "红色", "蓝色", "绿色", "紫色"],
"尺寸": ["超大", "大"]
},
"颜色标签2": {
"颜色2": ["黑色", "白色", "红色", "蓝色", "绿色", "紫色", "黄色"],
"尺寸2": ["超大", "大"]
}
}' as col
)
select key, inner_key, json_tuple(inner_json, inner_key) js
from (
select key, json_object_keys(inner_json) inner_keys, inner_json
from (
select key, json_tuple(col, key) inner_json
from (
select json_object_keys(col) keys, col
from base_tb
) as a
LATERAL view explode(keys) as key
) as a
) as a
LATERAL view explode(inner_keys) as inner_key
主要采用Spark 3.1.0新增的json_object_keys
函数拿到key值,然后依次提取,最后输出如下:
key | inner_key | js |
---|---|---|
颜色标签 | 颜色 | ["黑色", "白色", "红色", "蓝色", "绿色", "紫色"] |
颜色标签 | 尺寸 | ["超大", "大"] |
颜色标签2 | 颜色2 | ["黑色", "白色", "红色", "蓝色", "绿色", "紫色"] |
颜色标签2 | 尺寸2 | ["超大", "大"] |