Hive高级应用(三)
1.export导出数据 :
EXPORT TABLE db_hive.order TO '/user/caizhengjie/datas/export/order';
2.import导入数据
import table order_imp from '/user/caizhengjie/datas/export/order';
IMPORT TABLE order_imp_1 FROM '/user/caizhengjie/datas/export/order' LOCATION '/user/kfk/datas/imp/order'; -- LOCATION 指定数据表目录
准备数据
/opt/datas/hive/order.txt
0001,cenry,2018-10-09,product-1,350,guangzhou 0002,beny,2018-09-09,product-2,180,beijing 0003,ben,2018-09-09,product-3,580,beijing 0004,cherry,2018-10-09,product-4,450,shenzheng 0005,jone,2018-10-09,product-5,30,nanjing 0006,lili,2018-09-09,product-6,50,hangzhou 0007,chen,2018-10-09,product-7,90,wuhan 0008,wiwi,2018-09-09,product-8,150,chengdu
创建表
create table order(userid string,username string,order_date string,product_name string,price int,city string) row format delimited fields terminated by ',' stored as textfile;
1.字段查询
select userid,username from order;
userid username 0001 cenry 0002 beny 0003 ben 0004 cherry 0005 jone 0006 lili 0007 chen 0008 wiwi
2.where查询
select * from order where price > 200;
order.userid order.username order.order_date order.product_name order.price order.city 0001 cenry 2018-10-09 product-1 350 guangzhou 0003 ben 2018-09-09 product-3 580 beijing 0004 cherry 2018-10-09 product-4 450 shenzheng
3.limit查询
select * from order limit 3;
order.userid order.username order.order_date order.product_name order.price order.city 0001 cenry 2018-10-09 product-1 350 guangzhou 0002 beny 2018-09-09 product-2 180 beijing 0003 ben 2018-09-09 product-3 580 beijing
4.distinct去重
select distinct city from order;
city beijing chengdu guangzhou hangzhou nanjing shenzheng wuhan
5.max/min/count/sum
select max(price) from order;
_c0
6.group by / having 分组查询
select sum(price) price,city from order group by city having price > 200;
price city 760 beijing 350 guangzhou 450 shenzheng
准备数据order1.txt
1,0001,2018-09-09,product-1,180 2,0001,2018-09-09,product-2,240 3,0001,2018-09-09,product-3,350 4,0002,2018-09-09,product-4,110 5,0002,2018-09-09,product-5,230 6,0003,2018-09-09,product-6,245 7,0004,2018-09-09,product-7,965 8,0004,2018-09-09,product-8,741 9,0008,2018-09-09,product-8,690 10,0009,2018-09-09,product-9,120 11,0009,2018-09-09,product-9,120
customer.txt
0001,jack,17425845325,guangzhou 0002,beny,15451225965,shenzhen 0003,jone,13754859326,nanjing 0004,cherry,13785415255,suzhou 0005,alex,15745869325,beijing 0006,aili,13245632581,xiamen
创建order1表
create table if not exists db_hive.order1(orderid string,userid string,order_date string,product_name string,price string) row format delimited fields terminated by ',' stored as textfile;
创建customer表
create table if not exists db_hive.customer(userid string,username string,telephone string,city string) row format delimited fields terminated by ',' stored as textfile;
加载数据
load data local inpath '/opt/datas/hive/order1.txt' into table order1; load data local inpath '/opt/datas/hive/customer.txt' into table customer;
1.等值连接
select * from customer t1,order1 t2 where t1.userid = t2.userid;
t1.userid t1.username t1.telephone t1.city t2.orderid t2.userid t2.order_date t2.product_name t2.price 0001 jack 17425845325 guangzhou 1 0001 2018-09-09 product-1 180 0001 jack 17425845325 guangzhou 2 0001 2018-09-09 product-2 240 0001 jack 17425845325 guangzhou 3 0001 2018-09-09 product-3 350 0002 beny 15451225965 shenzhen 4 0002 2018-09-09 product-4 110 0002 beny 15451225965 shenzhen 5 0002 2018-09-09 product-5 230 0003 jone 13754859326 nanjing 6 0003 2018-09-09 product-6 245 0004 cherry 13785415255 suzhou 7 0004 2018-09-09 product-7 965 0004 cherry 13785415255 suzhou 8 0004 2018-09-09 product-8 741
2.左连接
select t1.username,t2.product_name from customer t1 left join order1 t2 on t1.userid = t2.userid;
t1.username t2.product_name jack product-1 jack product-2 jack product-3 beny product-4 beny product-5 jone product-6 cherry product-7 cherry product-8 alex NULL aili NULL
select t2.username,t1.product_name from order1 t1 left join customer t2 on t1.userid = t2.userid;
t2.username t1.product_name jack product-1 jack product-2 jack product-3 beny product-4 beny product-5 jone product-6 cherry product-7 cherry product-8 NULL product-8 NULL product-9 NULL product-9
3.右连接
select t1.username,t2.product_name from customer t1 right join order1 t2 on t1.userid = t2.userid;
t1.username t2.product_name jack product-1 jack product-2 jack product-3 beny product-4 beny product-5 jone product-6 cherry product-7 cherry product-8 NULL product-8 NULL product-9 NULL product-9
select t2.username,t1.product_name from order1 t1 right join customer t2 on t1.userid = t2.userid;
t2.username t1.product_name jack product-1 jack product-2 jack product-3 beny product-4 beny product-5 jone product-6 cherry product-7 cherry product-8 alex NULL aili NULL
4.全连接
select t2.username,t1.product_name from order1 t1 full join customer t2 on t1.userid = t2.userid;
t2.username t1.product_name jack product-3 jack product-2 jack product-1 beny product-5 beny product-4 jone product-6 cherry product-8 cherry product-7 alex NULL aili NULL NULL product-8 NULL product-9 NULL product-9
order /sort / cluster / distribute by
准备数据
order.userid order.username order.order_date order.product_name order.price order.city 0001 cenry 2018-10-09 product-1 350 guangzhou 0002 beny 2018-09-09 product-2 180 beijing 0003 ben 2018-09-09 product-3 580 beijing 0004 cherry 2018-10-09 product-4 450 shenzheng 0005 jone 2018-10-09 product-5 30 nanjing 0006 lili 2018-09-09 product-6 50 hangzhou 0007 chen 2018-10-09 product-7 90 wuhan 0008 wiwi 2018-09-09 product-8 150 chengdu
order by (只有一个reduce ,全局排序)
select * from order order by price desc; select * from order order by price asc;
order.userid order.username order.order_date order.product_name order.price order.city 0003 ben 2018-09-09 product-3 580 beijing 0004 cherry 2018-10-09 product-4 450 shenzheng 0001 cenry 2018-10-09 product-1 350 guangzhou 0002 beny 2018-09-09 product-2 180 beijing 0008 wiwi 2018-09-09 product-8 150 chengdu 0007 chen 2018-10-09 product-7 90 wuhan 0006 lili 2018-09-09 product-6 50 hangzhou 0005 jone 2018-10-09 product-5 30 nanjing
设置reduce个数 这里设置为3个
set mapreduce.job.reduces = 3;
但是不起作用 运行的时候还是一个。
每个reduce端都会进行排序 也就是局部有序 可以指定多个reduce。同时 如果想测试一下执行的效果 建议将输出结果保存到本地 并调整reduce的数量。 我设置成3个
sort by (对每一个reduce内部的数据进行排序 最后的全局结果集不排序)
设置reduce个数 这里设置为3个
set mapreduce.job.reduces = 3;
这里的写法是将数据写入到本地
insert overwrite local directory '/opt/datas/hive/sort' row format delimited fields terminated by ',' select * from order sort by price desc;
[caizhengjie bigdata-pro-m01 sort]$ ls 000000_0 000001_0 000002_0 [caizhengjie bigdata-pro-m01 sort]$ cat 000000_0 0008,wiwi,2018-09-09,product-8,150,chengdu 0007,chen,2018-10-09,product-7,90,wuhan 0006,lili,2018-09-09,product-6,50,hangzhou 0005,jone,2018-10-09,product-5,30,nanjing
[caizhengjie bigdata-pro-m01 sort]$ cat 000001_0 0003,ben,2018-09-09,product-3,580,beijing 0004,cherry,2018-10-09,product-4,450,shenzheng 0002,beny,2018-09-09,product-2,180,beijing
[caizhengjie bigdata-pro-m01 sort]$ cat 000002_0 0001,cenry,2018-10-09,product-1,350,guangzhou
指定分区原则。通常和sort by一起用 distribute by必须要写在sort by之前。理解成 按照XX字段分区 再按照XX字段排序。
distribute by (作用类似于partition,一般与sort by 一起使用)
设置reduce个数 这里设置为3个
set mapreduce.job.reduces = 3;
insert overwrite local directory '/opt/datas/hive/sort' row format delimited fields terminated by ',' select * from order distribute by city sort by price desc;
[caizhengjie bigdata-pro-m01 sort]$ ls 000000_0 000001_0 000002_0 [caizhengjie bigdata-pro-m01 sort]$ cat 000000_0 0004,cherry,2018-10-09,product-4,450,shenzheng 0001,cenry,2018-10-09,product-1,350,guangzhou
[caizhengjie bigdata-pro-m01 sort]$ cat 000001_0 0003,ben,2018-09-09,product-3,580,beijing 0002,beny,2018-09-09,product-2,180,beijing 0007,chen,2018-10-09,product-7,90,wuhan 0006,lili,2018-09-09,product-6,50,hangzhou 0005,jone,2018-10-09,product-5,30,nanjing
[caizhengjie bigdata-pro-m01 sort]$ cat 000002_0 0008,wiwi,2018-09-09,product-8,150,chengdu
当distribute by 和 sort by 所指定的字段相同时 即可以使用cluster by。
注意:cluster by指定的列只能是升序 不能指定asc和desc。
设置reduce个数 这里设置为3个
set mapreduce.job.reduces = 3;
cluster by (distribute by 和sort by字段相同时 使用cluster by 代替)
insert overwrite local directory '/opt/datas/hive/sort' row format delimited fields terminated by ',' select * from order distribute by username sort by username;
[caizhengjie bigdata-pro-m01 sort]$ cat 000000_0 0003,ben,2018-09-09,product-3,580,beijing 0007,chen,2018-10-09,product-7,90,wuhan 0004,cherry,2018-10-09,product-4,450,shenzheng 0006,lili,2018-09-09,product-6,50,hangzhou
等价于
insert overwrite local directory '/opt/datas/hive/sort' row format delimited fields terminated by ',' select * from order cluster by username;
[caizhengjie bigdata-pro-m01 sort]$ cat 000000_0 0003,ben,2018-09-09,product-3,580,beijing 0007,chen,2018-10-09,product-7,90,wuhan 0004,cherry,2018-10-09,product-4,450,shenzheng 0006,lili,2018-09-09,product-6,50,hangzhou
1.自定义UDF函数
加载pom.xml文件内容
<properties>
    <hive.version>0.13.1</hive.version>
</properties>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>${hive.version}</version>
</dependency>
编写udf代码
package com.kfk.hive; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; * author : 蔡政洁 * email :caizhengjie888 icloud.com * date : 2020/10/23 * time : 8:31 下午 public class KfkUDF extends UDF { public Text evaluate(final Text s) { if (s null) { return null; } return new Text(s.toString().toUpperCase()); public static void main(String[] args) { System.out.println(new KfkUDF().evaluate(new Text( spark )));
2.打jar包
3.添加jar包
hive (db_hive) add jar /opt/jars/bigdata_study_udf.jar; hive (db_hive) list jar;
4.Create Function
create temporary function kfk_udf as 'com.kfk.hive.KfkUDF';
5.使用自定义函数
select kfk_udf(city) city from order;
city GUANGZHOU BEIJING BEIJING SHENZHENG NANJING HANGZHOU WUHAN CHENGDU
有关hive的UDF操作可以见官网
https://cwiki.apache.org/confluence/display/Hive/HivePlugins
1.启动metastore
bin/hive --service metastore
2.启动hiverserver2
bin/hiveserver2
3.启动beeline客户端
bin/beeline
4.连接hive
% bin/beeline
beeline> !connect jdbc:hive2://<host>:<port>/<db>;auth=noSasl hiveuser pass
!connect jdbc:hive2://bigdata-pro-m01:10000 caizhengjie 199911
5.查看数据
1: jdbc:hive2://bigdata-pro-m01:10000 select * from order; --------------- ----------------- ------------------- --------------------- -------------- ------------- -- | order.userid | order.username | order.order_date | order.product_name | order.price | order.city | --------------- ----------------- ------------------- --------------------- -------------- ------------- -- | 0001 | cenry | 2018-10-09 | product-1 | 350 | guangzhou | | 0002 | beny | 2018-09-09 | product-2 | 180 | beijing | | 0003 | ben | 2018-09-09 | product-3 | 580 | beijing | | 0004 | cherry | 2018-10-09 | product-4 | 450 | shenzheng | | 0005 | jone | 2018-10-09 | product-5 | 30 | nanjing | | 0006 | lili | 2018-09-09 | product-6 | 50 | hangzhou | | 0007 | chen | 2018-10-09 | product-7 | 90 | wuhan | | 0008 | wiwi | 2018-09-09 | product-8 | 150 | chengdu | --------------- ----------------- ------------------- --------------------- -------------- ------------- --
可以看出来 通过beeline客户端连接的hive比hive本身的命令显示出来的数据格式更加直观。
具体beeline操作可以详见官网
https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=82903124#HiveServer2Clients-table
1.添加pom.xml配置文件
<properties>
    <hive.version>0.13.1</hive.version>
</properties>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>${hive.version}</version>
</dependency>
2.编写JDBC代码
package com.kfk.hive;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Demonstrates querying Hive through the HiveServer2 JDBC driver.
 *
 * @author 蔡政洁
 * email: caizhengjie888@icloud.com
 * date: 2020/10/23
 * time: 9:53 PM
 */
public class HiveJDBC {

    private static String driverName = "org.apache.hive.jdbc.HiveDriver";

    /**
     * Connects to HiveServer2, runs a full-table scan on `order`, and
     * prints the first two columns of each row.
     *
     * @param args unused
     * @throws SQLException if the connection or query fails
     */
    public static void main(String[] args) throws SQLException {
        try {
            // Register the Hive JDBC driver before opening a connection.
            Class.forName(driverName);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            System.exit(1);
        }

        // Replace user/password with the account the queries should run as.
        Connection con = DriverManager.getConnection(
                "jdbc:hive2://bigdata-pro-m01:10000/db_hive",
                "caizhengjie", "199911");
        Statement stmt = con.createStatement();
        String tableName = "order";
        ResultSet res = null;

        // select * query
        String sql = "select * from " + tableName;
        System.out.println("Running: " + sql);
        res = stmt.executeQuery(sql);
        while (res.next()) {
            System.out.println(String.valueOf(res.getString(1)) + "\t" + res.getString(2));
        }
    }
}
运行结果
Running: select * from order 0001 cenry 0002 beny 0003 ben 0004 cherry 0005 jone 0006 lili 0007 chen 0008 wiwi
有关Hive与JDBC的集成操作见官网
Zeppelin框架及Hive查询操作 Apache Zeppelin是一款基于Web交互式框架,支持多种语言,Scala、SparkSQL、Markdown,SQL、Shell、Python等。可以使用Zeppelin链接SparkSQL。Zeppelin提供数据分析、数据可视化。打开浏览器 访问,xxxx:8090。Zeppelin安装和使用。一键启动集群中所有的组件。
Hive高级应用(一) Apache Hive是一个构建在hadoop基础设施之上的数据仓库。通过Hive可以使用HQL语言查询存放在HDFS上的数据。HQL是一种类SQL语言,这种语言最终被转化为MapReduce. 虽然Hive提供了SQL查询功能,但是Hive不能够进行交互查询–因为它只能够在Haoop上批量的执行Hadoop。
CDP中的Hive3系列之Hive性能调优 要将数据从 RDBMS(例如 MySQL)迁移到 Hive,您应该考虑使用带有 Teradata 连接器的 CDP 中的 Apache Sqoop。Apache Sqoop 客户端基于 CLI 的工具在关系数据库和 HDFS 或云对象存储之间批量传输数据。 需要进行提取、转换和加载 (ETL) 过程的遗留系统数据源通常驻留在文件系统或对象存储中。您还可以以分隔文本(默认)或 SequenceFile 格式导入数据,然后将数据转换为 Hive 推荐的 ORC 格式。通常,为了在 Hive 中查询数据,ORC 是首选格式,因为 ORC 提供了性能增强。
相关文章
- Pandas-高级处理(五):一般性“拆分-应用-合并”【一般化Groupby方法:apply】
- 2015330实验室例会之搜索引擎小应用
- CrossOver22可以直接在Mac系统上运行Windows应用
- 5款应用满足所有你对日历的需求
- Atmel推出面向物联网边缘节点应用、内置MCU的集成型Wi-Fi模块
- Fluter 应用调试
- Ext JS4百强应用: 用grid.plugin.CellEditing做高级查询 --第10强
- 浪潮发布新一代主机天梭M13 可承载大型机应用
- 云联云通讯报错:应用与模板id不匹配,解决方法
- SanDisk闪迪丰富解决方案组合 助汽车和物联网应用
- Rsync的一个高级应用
- Neo4j高级应用技术专题系列 - APOC存储过程库-【1】概述
- Java多线程与并发库高级应用-工具类介绍
- Hive高级应用(一)
- Hive高级应用(二)
- cs231 pytorch ignite 框架高级API应用案例mnist_with_tensorboardx
- ReportPortal 在python下的应用介绍之----python各框架整合
- Spring在非web应用中关闭IoC容器 (registerShutdownHook)
- 【干货】阿里全息大数据构建与应用
- HBase应用实践专场-HBase问题排查思路
- java多线程并发库高级应用 之 java5中的线程并发库--线程池、Callable&Future
- 多线程并发库高级应用 之 使用java5中同步技术的3个面试题
- pfsense 2.2RC版本应用
- C语言高级应用---操作linux下V4L2摄像头应用程序
- dotnet 在 UOS 国产系统上使用 MonoDevelop 进行拖控件开发 GTK 应用
- 国产环境小卫星高光谱数据应用于沿海植被环境监测
- 数据库语句高级应用之修改数据库兼容版本