hive性能调优 读书笔记 - 调优多样性(改写sql、数据块大小、格式、分区、分桶)
learn from 《Hive性能调优实战》
本文使用的 hive 版本 2.3.2
1. 生成数据
生成 student 数据脚本
import random
import datetime
import os

# Character pools used to build random names: a surname is one character
# from `lastname`, a given name is one character from `firstname`.
lastname = u"赵李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗"
firstname = u"红尘冷暖岁月清浅仓促间遗落一地如诗的句点不甘愿不决绝掬一份刻骨的思念系一根心的挂牵在你回眸抹兰轩的底色悄然"


def create_student_dict(start, num_records=40000, out_dir='.'):
    """Write one batch of fake student records to ``<out_dir>/<start>.txt``.

    Each output line carries 7 tab-separated fields:
    s_no, s_name, s_birth, s_age, s_sex, s_score, s_desc.

    :param start: batch number; names the output file and fixes the
        student-id range ``[start * num_records, (start + 1) * num_records)``.
    :param num_records: records per batch (default 40000, as in the book).
    :param out_dir: directory the data file is written into.
    """
    # Pool of scores drawn from a normal distribution N(100, 50).
    # The original built 4999 entries via range(1, 5000) but indexed only
    # 1000..4990; random.choice over a full 5000-entry pool fixes both.
    score_pool = [int(random.normalvariate(100, 50)) for _ in range(5000)]
    # One file per batch, named "<batch>.txt", e.g. 1.txt.
    filename = str(start) + '.txt'
    print(filename)
    # utf-8 is pinned so the Chinese text round-trips on any platform.
    with open(os.path.join(out_dir, filename), mode='w', encoding='utf-8') as fp:
        for i in range(start * num_records, (start + 1) * num_records):
            # random.choice spans the whole pool; the original
            # randint(1, len - 1) indexing could never pick character 0.
            model = {
                "s_no": u"xuehao_no_" + str(i),
                "s_name": u"{0}{1}".format(random.choice(lastname),
                                           random.choice(firstname)),
                "s_birth": u"{0}-{1}-{2}".format(random.randint(1991, 2000),
                                                 '0' + str(random.randint(1, 9)),
                                                 random.randint(10, 28)),
                "s_age": random.choice([20, 20, 20, 20, 21, 22, 23, 24, 25, 26]),
                "s_sex": str(random.choice(['男', '女'])),
                "s_score": abs(random.choice(score_pool)),
                # Adjacent literals concatenate first, then the whole
                # sentence is repeated 1-20 times.
                's_desc': u"为程序猿攻城狮队伍补充新鲜血液,"
                          u"为祖国未来科技产业贡献一份自己的力量" * random.randint(1, 20),
            }
            # Emit one tab-separated record per student.
            fp.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                model['s_no'], model['s_name'], model['s_birth'],
                model['s_age'], model['s_sex'], model['s_score'],
                model['s_desc']))
# Generate 500 batches -> 40000 * 500 = 20 million records in total.
# (The original bound an unused `starttime` each iteration; dropped.)
if __name__ == '__main__':
    for batch in range(1, 501):
        create_student_dict(batch)
会生成500个数据文件
2. 加载表
创建hdfs目录
hdfs dfs -mkdir -p /opt/hive_learning/warehouse/student_tb_txt/
数据放入
hdfs dfs -put ./init_student/*txt /opt/hive_learning/warehouse/student_tb_txt/
create table if not exists default.student_tb_txt(
s_no string comment '学号',
s_name string comment '姓名',
s_birth string comment '生日',
s_age bigint comment '年龄',
s_sex string comment '性别',
s_score bigint comment '综合能力得分',
s_desc string comment '自我介绍')
row format delimited
fields terminated by '\t'
location '/opt/hive_learning/warehouse/student_tb_txt/';
hive -f /opt/bdp/data01/ch1/create_tb1.sql
执行以上 sql
同样的再生成另一个 course 数据表
import random
import datetime
import sys
import os


def create_student_sc_dict(start, num_records=40000, out_dir='.'):
    """Write one batch of fake course-selection records to ``<out_dir>/<start>.txt``.

    Each output line carries 4 tab-separated fields:
    s_no, course, op_datetime, reason.

    :param start: batch number; names the output file and fixes the
        student-id range ``[start * num_records, (start + 1) * num_records)``.
    :param num_records: records per batch (default 40000, as in the book).
    :param out_dir: directory the data file is written into.
    """
    # 数学 appears 5 times out of 10, so it is picked with probability 1/2;
    # the more often a course appears here, the more students "like" it.
    courses = [u'数学', u'数学', u'数学', u'数学', u'数学',
               u'语文', u'英语', u'化学', u'物理', u'生物']
    filename = str(start) + '.txt'
    # Print the file name (not just the batch number) for consistency with
    # the student-table generator.
    print(filename)
    # The date string is loop-invariant at day granularity; hoist it instead
    # of calling datetime.now() once per record.
    op_date = datetime.datetime.now().strftime("%Y-%m-%d")
    # utf-8 is pinned so the Chinese text round-trips on any platform.
    with open(os.path.join(out_dir, filename), mode='w', encoding='utf-8') as fp:
        for i in range(start * num_records, (start + 1) * num_records):
            course = random.choice(courses)
            # Adjacent literals concatenate first, then .format fills in
            # the chosen course.
            fp.write("{0}\t{1}\t{2}\t{3}\n".format(
                u"xuehao_no_" + str(i),
                course,
                op_date,
                u"我非常非常非常非常非常非常非常"
                u"非常非常非常非常非常非常非常喜爱{0}".format(course)))
# Generate 500 batches -> 40000 * 500 = 20 million records in total.
# (The original bound an unused `starttime` and carried a stale comment
# about dataframes/temp tables; both dropped.)
if __name__ == '__main__':
    for batch in range(1, 501):
        create_student_sc_dict(batch)
hdfs dfs -mkdir -p /opt/hive_learning/warehouse/student_sc_tb_txt/
hdfs dfs -put ./init_course/*txt /opt/hive_learning/warehouse/student_sc_tb_txt/
create table if not exists default.student_sc_tb_txt(
s_no string comment '学号',
course string comment '课程名',
op_datetime string comment '操作时间',
reason string comment '选课原因')
row format delimited
fields terminated by '\t'
location '/opt/hive_learning/warehouse/student_sc_tb_txt/';
- 执行sql
hive -f /opt/bdp/data01/ch1/create_tb2.sql
检查数据
hive> select count(*) from student_tb_txt;
OK
20000000
hive> select * from student_tb_txt limit 2;
OK
xuehao_no_40000 蒋不 1995-03-24 20 女 125 为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量
xuehao_no_40001 鲁眸 1999-06-16 23 男 29 为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量为程序猿攻城狮队伍补充新鲜血液,为祖国未来科技产业贡献一份自己的力量
Time taken: 5.596 seconds, Fetched: 2 row(s)
hive> select count(*) from student_sc_tb_txt;
OK
20000000
Time taken: 9.364 seconds, Fetched: 1 row(s)
hive> select * from student_sc_tb_txt limit 2;
OK
xuehao_no_40000 数学 2023-02-23 我非常非常非常非常非常非常非常非常非常非常非常非常非常非常喜爱数学
xuehao_no_40001 数学 2023-02-23 我非常非常非常非常非常非常非常非常非常非常非常非常非常非常喜爱数学
Time taken: 0.191 seconds, Fetched: 2 row(s)
3. union 例子
DROP TABLE if EXISTS student_stat;
--创建student_stat
create table student_stat(a bigint, b bigint) partitioned by (tp string) STORED AS TEXTFILE;
--开启动态分区
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT into table student_stat partition(tp)
select s_age, max(s_birth) stat, 'max' tp from student_tb_txt
group by s_age
union all
select s_age, min(s_birth) stat, 'min' tp from student_tb_txt
group by s_age;
执行结果
Query ID = root_20230223034526_15e51a06-9772-4b78-b1e6-a4cae6fb688f
Total jobs = 5
Launching Job 1 out of 5
Number of reduce tasks not specified. Estimated from input data size: 86
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Job running in-process (local Hadoop)
2023-02-23 03:45:33,821 Stage-1 map = 0%, reduce = 0%
2023-02-23 03:45:36,999 Stage-1 map = 100%, reduce = 0%
2023-02-23 03:46:37,908 Stage-1 map = 100%, reduce = 0%
2023-02-23 03:47:38,211 Stage-1 map = 100%, reduce = 0%
2023-02-23 03:48:33,167 Stage-1 map = 100%, reduce = 1%
2023-02-23 03:48:34,184 Stage-1 map = 100%, reduce = 100%
2023-02-23 03:48:40,372 Stage-1 map = 100%, reduce = 94%
2023-02-23 03:48:41,392 Stage-1 map = 100%, reduce = 100%
Ended Job = job_local1201441739_0001
Launching Job 2 out of 5
Number of reduce tasks not specified. Estimated from input data size: 86
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Job running in-process (local Hadoop)
2023-02-23 03:48:43,061 Stage-9 map = 0%, reduce = 0%
2023-02-23 03:48:45,524 Stage-9 map = 100%, reduce = 0%
2023-02-23 03:49:45,668 Stage-9 map = 100%, reduce = 0%
2023-02-23 03:50:46,582 Stage-9 map = 100%, reduce = 0%
2023-02-23 03:51:45,862 Stage-9 map = 99%, reduce = 0%
2023-02-23 03:51:46,883 Stage-9 map = 100%, reduce = 100%
2023-02-23 03:51:47,900 Stage-9 map = 100%, reduce = 26%
2023-02-23 03:51:48,920 Stage-9 map = 100%, reduce = 44%
2023-02-23 03:51:50,000 Stage-9 map = 100%, reduce = 100%
Ended Job = job_local382327797_0002
Launching Job 3 out of 5
Number of reduce tasks is set to 0 since there's no reduce operator
Job running in-process (local Hadoop)
2023-02-23 03:51:55,755 Stage-2 map = 100%, reduce = 0%
Ended Job = job_local175557105_0003
Stage-5 is selected by condition resolver.
Stage-4 is filtered out by condition resolver.
Stage-6 is filtered out by condition resolver.
Moving data to directory hdfs://namenode:8020/user/hive/warehouse/student_stat/.hive-staging_hive_2023-02-23_03-45-26_511_1911436748613982869-1/-ext-10000
Loading data to table default.student_stat partition (tp=null)
Loaded : 2/2 partitions.
Time taken to load dynamic partitions: 0.692 seconds
Time taken for adding to write entity : 0.004 seconds
MapReduce Jobs Launched:
Stage-Stage-1: HDFS Read: 2829597299021 HDFS Write: 0 SUCCESS
Stage-Stage-9: HDFS Read: 6562633817501 HDFS Write: 0 SUCCESS
Stage-Stage-2: HDFS Read: 87836153376 HDFS Write: 375 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
Time taken: 390.35 seconds
5个job,390s
优化建议
减少产生 中间数据 --> 减少 磁盘/网络 IO 时间,减少 job 数量,就是减少 MapReduce 作业(减少数据经历的磁盘读写和网络通讯)
以下在一个job中完成 min,max 的查询
DROP TABLE if EXISTS student_stat;
--创建student_stat
create table student_stat(a bigint, b bigint) partitioned by (tp string) STORED AS TEXTFILE;
--开启动态分区
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
from student_tb_txt
INSERT into table student_stat partition(tp)
select s_age, min(s_birth) stat,'min' tp
group by s_age
insert into table student_stat partition(tp)
select s_age, max(s_birth) stat, 'max' tp
group by s_age;
Query ID = root_20230223040251_e7b50f6d-b2af-4e3c-affe-97609daceedb
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 86
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Job running in-process (local Hadoop)
2023-02-23 04:02:55,187 Stage-2 map = 0%, reduce = 0%
。。。
2023-02-23 04:07:55,203 Stage-2 map = 100%, reduce = 98%
2023-02-23 04:07:56,237 Stage-2 map = 100%, reduce = 100%
Ended Job = job_local1914433427_0001
Loading data to table default.student_stat partition (tp=null)
Loaded : 1/1 partitions.
Time taken to load dynamic partitions: 0.551 seconds
Time taken for adding to write entity : 0.003 seconds
Loading data to table default.student_stat partition (tp=null)
Loaded : 1/1 partitions.
Time taken to load dynamic partitions: 0.135 seconds
Time taken for adding to write entity : 0.001 seconds
MapReduce Jobs Launched:
Stage-Stage-2: HDFS Read: 2829597299021 HDFS Write: 91698 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
Time taken: 306.453 seconds
1个job,306s,比上面 390s 提升了 21.5%
再改写sql,将上面的 union 删掉,分别进行两次计算,看似少了 union, 会节省时间?
drop table if exists student_stat;
create table student_stat
(
a bigint,
b bigint
) partitioned by (tp string) STORED AS TEXTFILE;
set
hive.exec.dynamic.partition=true;
set
hive.exec.dynamic.partition.mode=nonstrict;
--计算max值
insert into table student_stat partition(tp)
select s_age, max(s_birth) stat, 'max' tp
from student_tb_txt
group by s_age;
--计算min值
insert into table student_stat partition(tp)
select s_age, min(s_birth) stat, 'min' tp
from student_tb_txt
group by s_age;
Query ID = root_20230223041353_58ffcacc-7405-4687-b79c-0a8576584fdd
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 86
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Job running in-process (local Hadoop)
2023-02-23 04:14:00,670 Stage-1 map = 0%, reduce = 0%
。。。
2023-02-23 04:17:11,809 Stage-1 map = 100%, reduce = 100%
Ended Job = job_local1693620468_0001
Loading data to table default.student_stat partition (tp=null)
Loaded : 1/1 partitions.
Time taken to load dynamic partitions: 0.631 seconds
Time taken for adding to write entity : 0.003 seconds
MapReduce Jobs Launched:
Stage-Stage-1: HDFS Read: 2829597299021 HDFS Write: 45849 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
Time taken: 199.423 seconds
Query ID = root_20230223041713_2d2da63f-275f-47d4-b9cc-3ff83399e175
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 86
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Job running in-process (local Hadoop)
2023-02-23 04:17:15,068 Stage-1 map = 0%, reduce = 0%
。。。
2023-02-23 04:20:30,141 Stage-1 map = 100%, reduce = 100%
Ended Job = job_local1164474387_0002
Loading data to table default.student_stat partition (tp=null)
Loaded : 1/1 partitions.
Time taken to load dynamic partitions: 0.089 seconds
Time taken for adding to write entity : 0.0 seconds
MapReduce Jobs Launched:
Stage-Stage-1: HDFS Read: 6562633943131 HDFS Write: 178619 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
Time taken: 197.531 seconds
共2个job,397s,不比上面的写法快
4. 调整数据块大小的影响
set hive.merge.mapfiles=true;
set hive.merge.orcfile.stripe.level=true;
set hive.merge.size.per.task=268435456;
set hive.merge.smallfiles.avgsize=16777216;
create table student_tb_orc like student_tb_txt stored as orc;
insert into student_tb_orc
select * from student_tb_txt;
create table student_tb_txt_bigfile like student_tb_txt stored as textfile;
insert into student_tb_txt_bigfile
select * from student_tb_orc;
生成了20多G的单个数据文件
hdfs dfs -ls -R -h hdfs://namenode:8020/user/hive/warehouse/student_tb_txt_bigfile
-rwxrwxr-x 3 root supergroup 20.5 G 2023-02-23 04:51 hdfs://namenode:8020/user/hive/warehouse/student_tb_txt_bigfile/000000_0
再执行sql
DROP TABLE if EXISTS student_stat;
create table student_stat(a bigint, b bigint) partitioned by (tp string) STORED AS TEXTFILE;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
--业务过程同1.3一样计算各个年龄段,最小出生日期和最大出生日期
from student_tb_txt_bigfile
INSERT into table student_stat partition(tp)
select s_age, min(s_birth) stat, 'min' tp
GROUP by s_age
insert into table student_stat partition(tp)
select s_age, max(s_birth) stat, 'max' tp
GROUP by s_age;
比较表的信息:
hive> desc formatted student_tb_txt;
OK
# col_name data_type comment
s_no string ������
s_name string ������
s_birth string ������
s_age bigint ������
s_sex string ������
s_score bigint ������������������
s_desc string ������������
# Detailed Table Information
Database: default
Owner: root
CreateTime: Thu Feb 23 03:09:07 UTC 2023
LastAccessTime: UNKNOWN
Retention: 0
Location: hdfs://namenode:8020/opt/hive_learning/warehouse/student_tb_txt
Table Type: MANAGED_TABLE
Table Parameters:
numFiles 500
totalSize 21962504302
transient_lastDdlTime 1677121747
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
field.delim \t
serialization.format \t
hive> desc formatted student_tb_txt_bigfile;
OK
# col_name data_type comment
s_no string ������
s_name string ������
s_birth string ������
s_age bigint ������
s_sex string ������
s_score bigint ������������������
s_desc string ������������
# Detailed Table Information
Database: default
Owner: root
CreateTime: Thu Feb 23 04:48:35 UTC 2023
LastAccessTime: UNKNOWN
Retention: 0
Location: hdfs://namenode:8020/user/hive/warehouse/student_tb_txt_bigfile
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
numFiles 1
numRows 20000000
rawDataSize 21939038344
totalSize 21959038344
transient_lastDdlTime 1677127884
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
field.delim \t
serialization.format \t
文件数量不一样,500个小文件 和 1个大文件
单个大文件的执行耗时 Time taken: 288.97 seconds
(比上面的稍微快点,不同的环境比较的结果会有差异,跟书上的不一致)
5. 数据格式
- SequenceFile,早期 hadoop 广泛应用
- Parquet,兼容多种引擎
- ORC,优化的数据格式,是目前的主流格式
创建其他数据格式的相同表
--step 1.创建一张和student_tb_orc表结构一样的student_tb_seq,存储采用sequencefile
create table if not exists student_tb_seq like student_tb_orc STORED as SEQUENCEFILE;
--step 2.拷贝student_tb_orc的数据到student_tb_seq
insert into table student_tb_seq
select * from student_tb_orc;
--step 3.创建表student_tb_par,表结构和student_tb_seq一样的,存储采用Parquet
create table if not exists student_tb_par like student_tb_orc STORED as PARQUET;
--step 4.拷贝student_tb_orc的数据到student_tb_par
insert overwrite table student_tb_par
select * from student_tb_orc;
分别执行 相同的查询
DROP TABLE if EXISTS student_stat;
create table student_stat(a bigint, b bigint) partitioned by (tp string)
STORED AS {data_type};
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
from student_tb_{type}
INSERT into table student_stat partition(tp)
select s_age,min(s_birth) stat,'min' tp
GROUP by s_age
insert into table student_stat partition(tp)
select s_age,max(s_birth) stat,'max' tp
GROUP by s_age;
数据格式 | 耗时 s |
---|---|
TEXTFILE | 306 |
Sequencefile | 147 |
parquet | 55 |
orc | 54 |
6. 分区
- hive 的分区就是 表的数据存储目录下的子目录
hdfs dfs -ls -R hdfs://namenode:8020/user/hive/warehouse/
可以查看已有的目录和文件
DROP TABLE if EXISTS student_orc_partition;
create table student_orc_partition(
s_no string,
s_name string,
s_birth string,
s_sex string,
s_score bigint,
s_desc string
) partitioned by (s_age bigint) STORED AS orc;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT into table student_orc_partition partition(s_age)
select s_no, s_name, s_birth, s_sex, s_score, s_desc, s_age
from student_tb_orc;
注意:分区字段一定要写在 select 最后
7. 分桶
- 分桶是更细粒度的数据划分,对列值取 hash % 桶数,分桶不改变数据目录结构,只改变数据在文件中的分布
--如果存在student_orc_bucket表就删除
DROP TABLE if EXISTS student_orc_bucket;
--创建student_orc_bucket桶表s_age为分桶列
create table if not exists student_orc_bucket(
s_no string,
s_name string,
s_birth string,
s_age bigint,
s_sex string,
s_score bigint,
s_desc string
)
--分成16个桶
clustered BY (s_age) INTO 16 BUCKETS
STORED AS ORC;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.enforce.bucketing = true;
insert into table student_orc_bucket
select s_no,s_name,s_birth,s_age,s_sex,s_score,s_desc
from student_tb_orc;
- 既分区又分桶
DROP TABLE if EXISTS student_orc_partition_bucket;
--创建student_orc_partition_bucket,以part为分区列,s_age为分桶列
--part等于对s_no取hash值的结果取模10,即pmod(hash(s_no),10)
create table if not exists student_orc_partition_bucket(
s_no string ,
s_name string ,
s_birth string ,
s_age string,
s_sex string,
s_score bigint,
s_desc string
)
partitioned by(part bigint)
clustered BY (s_age) INTO 16 BUCKETS
STORED AS ORC;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.enforce.bucketing = true;
insert into table student_orc_partition_bucket partition(part)
select s_no,s_name,s_birth,s_age,s_sex,s_score,s_desc, pmod(hash(s_no),10) part
from student_tb_orc;
共有 10 个分区,每个分区下有16个文件
8. 分桶分区比较
select s_age, count(s_no)
from {table}
where s_age < 23
group by s_age
table | 执行以上sql 时间 s | HDFS Read |
---|---|---|
student_tb_orc 普通表 | 24.3 | 118960302 |
student_orc_partition 分区表 | 21.4 | 70777620 |
student_orc_bucket 分桶表 | 23.9 | 136478970 |
student_orc_partition_bucket 分区分桶表 | 38.9 | 161422452 |
以上只是说明 有差异,究竟谁好谁坏,需要特定环境下进行测试和选择
最后 hive 调优跟 hive 自身的迭代也有差异,需要注意
相关文章
- 快速导出Oracle数据库表记录SQL语句大全(oracle导出表数据sql)
- Linux下将SQL文件导入数据库的命令指南(linux导入sql文件命令)
- 「一键导出MySQL SQL文件,轻松备份数据库数据」(mysql导出sql工具)
- Mysql中删除记录的SQL语句(mysql删除sql语句)
- 微耕技术助力SQL Server性能优化(微耕 sqlserver)
- 数据合法性使用SQL语句判断MSSQL数据合法性(sql判断mssql)
- 快速高效的MSSQL导入SQL方法,让数据转移无压力。(mssql导入sql)
- SQL Server活跃表:善用它获取性能收益(sqlserver活跃表)
- SQL Server慢速日志:精准定位网站性能瓶颈(sqlserver慢日志)
- 性能报告Oracle季度SQL性能分析报告(oracle 季度sql)
- 优化SQL Server主节点,提升系统性能(sqlserver主节点)
- MySQL如何进行批量导入SQL数据?(mysql批量导入sql)
- 精通Oracle元数据之SQL编程(oracle元数据sql)
- MySQL高级技巧学会用两条SQL拼接查询数据(mysql两条sql拼接)
- Oracle与SQL连接之路追求数据的完美结合(oracle与sql链接)
- Oracle SQL让你的数据不再缺位(oracle sql补位)
- 使用Oracle SQL处理文本数据的方法(oracle sql文本)
- 库Oracle SQL数据库指引未来发展路径(oracle sql数据)