Official documentation
1、Databases
The default database is default.
-- Create a database
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
[COMMENT database_comment]
[LOCATION hdfs_path]
[WITH DBPROPERTIES (property_name=property_value, ...)];
-- Drop a database
DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];
-- Alter a database
ALTER (DATABASE|SCHEMA) database_name SET DBPROPERTIES (property_name=property_value, ...);
ALTER (DATABASE|SCHEMA) database_name SET OWNER [USER|ROLE] user_or_role;
-- Switch databases
USE database_name;
USE DEFAULT;
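As a quick illustration of the DDL above (the database name, location and property values below are made up for this sketch):
CREATE DATABASE IF NOT EXISTS mydb COMMENT 'demo database' LOCATION '/user/hive/warehouse/mydb.db' WITH DBPROPERTIES ('creator'='root');
ALTER DATABASE mydb SET DBPROPERTIES ('edited-by'='root');
USE mydb;
DROP DATABASE IF EXISTS mydb CASCADE; -- CASCADE also drops any tables inside the database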
2、Creating tables
-- Method 1
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive 0.14.0 and later)
[(col_name data_type [COMMENT col_comment], ... [constraint_specification])]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[SKEWED BY (col_name, col_name, ...) -- (Note: Available in Hive 0.10.0 and later)
ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
[STORED AS DIRECTORIES]]
[
[ROW FORMAT row_format]
[STORED AS file_format]
| STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)] -- (Note: Available in Hive 0.6.0 and later)
]
[LOCATION hdfs_path] -- add data from an existing HDFS directory
[TBLPROPERTIES (property_name=property_value, ...)] -- (Note: Available in Hive 0.6.0 and later)
[AS select_statement]; -- (table data comes from the subquery result; not supported for external tables)
-- Method 2: copy the structure of an existing table
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
LIKE existing_table_or_view_name
[LOCATION hdfs_path];
-- Data types
data_type
: primitive_type
| array_type
| map_type
| struct_type
| union_type -- (Note: Available in Hive 0.7.0 and later)
-- Primitive types
primitive_type
: TINYINT
| SMALLINT
| INT
| BIGINT
| BOOLEAN
| FLOAT
| DOUBLE
| DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
| STRING
| BINARY -- (Note: Available in Hive 0.8.0 and later)
| TIMESTAMP -- (Note: Available in Hive 0.8.0 and later)
| DECIMAL -- (Note: Available in Hive 0.11.0 and later)
| DECIMAL(precision, scale) -- (Note: Available in Hive 0.13.0 and later)
| DATE -- (Note: Available in Hive 0.12.0 and later)
| VARCHAR -- (Note: Available in Hive 0.12.0 and later)
| CHAR -- (Note: Available in Hive 0.13.0 and later)
-- Array
array_type
: ARRAY < data_type >
-- Map (key-value pairs)
map_type
: MAP < primitive_type, data_type >
-- Struct (custom structure)
struct_type
: STRUCT < col_name : data_type [COMMENT col_comment], ...>
union_type
: UNIONTYPE < data_type, data_type, ... > -- (Note: Available in Hive 0.7.0 and later)
-- Row format definition
row_format
: DELIMITED
[FIELDS TERMINATED BY char [ESCAPED BY char]] -- field delimiter
[COLLECTION ITEMS TERMINATED BY char] -- collection item delimiter
[MAP KEYS TERMINATED BY char] -- delimiter between map keys and values
[LINES TERMINATED BY char] -- line delimiter
[NULL DEFINED AS char] -- NULL value definition (Note: Available in Hive 0.13 and later)
-- custom implementation: specify the SerDe class name
| SERDE serde_name
-- 属性值
[WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
-- File format definition
file_format:
: SEQUENCEFILE
| TEXTFILE -- (Default, depending on hive.default.fileformat configuration)
| RCFILE -- (Note: Available in Hive 0.6.0 and later)
| ORC -- (Note: Available in Hive 0.11.0 and later)
| PARQUET -- (Note: Available in Hive 0.13.0 and later)
| AVRO -- (Note: Available in Hive 0.14.0 and later)
| INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
-- Constraint definition
constraint_specification:
: [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE ]
[, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE ]
Example:
Sample data:
1,devin,22,game-study-beauty,beijing:haidian-changsha:nueluqu
2,liuxinrong,21,movie-study-man,beijing:haidian-changsha:ningxiang
Create a managed (internal) table --> so that this CREATE TABLE statement can be pasted directly into the Hive CLI, no line may begin with a tab character, otherwise the paste gets garbled
create table psn (
id int,
name string,
age int,
likes array<string>,
address map<string,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n';
Create an external table
create external table psn2 (
id int,
name string,
age int,
likes array<string>,
address map<string,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n'
location '/data/hive/psn2/'; -- the location must be an HDFS directory, not a file
Create a table by copying another table's structure
create table psn_copy like psn;
Create a table from a query result (CTAS)
create table psn_select -- a column list cannot be specified here
as select id,name,age from psn; -- CTAS cannot be used to create an external table
-- Create a table from a query result and specify the delimiters at the same time
create table if not exists psn_select_format
row format delimited
fields terminated by ' '
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n'
as select id,name,age from psn;
select address['beijing'] from psn where name='devin'; -- access a map column by key
Show table structure
describe/desc [extended|formatted] table_name
-- formatted shows more detail
-- shows the location and the table type, so you can tell whether it is a managed (internal) or external table
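For example, to see the full storage details (location, table type, SerDe) of the psn table created above:
desc formatted psn;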
3、Dropping tables
DROP TABLE [IF EXISTS] table_name [PURGE];
-- with PURGE the data is not moved to .Trash/Current; it is deleted permanently
-- dropping a non-existent table returns an error unless IF EXISTS is added
Dropping a managed (internal) table deletes both the metadata and the data.
Dropping an external table only deletes the metadata in the metastore; the table data in HDFS is not deleted.
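For example, removing the psn_copy table created earlier (PURGE is shown only to illustrate the option; it requires Hive 0.14 or later):
drop table if exists psn_copy purge;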
4、Loading data
-- Syntax
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
-- load data from HDFS
load data inpath '/root/hivedata' overwrite into table psn;
-- load data from the local file system
load data local inpath '/root/hivedata' overwrite into table psn;
-- filepath must not contain subdirectories
Adding [HDFS] data with LOAD DATA does not trigger a MapReduce job.
For an external table the data files are [moved, not copied] to the directory specified by LOCATION at table-creation time.
If no LOCATION was given at table creation, the files are moved into the Hive warehouse directory, the same as for a managed table.
Inserting data from a query result triggers a MapReduce job.
-- Syntax
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;
-- Example: a column list cannot be specified after INSERT
from psn
insert into table rowcount select count(*); -- INTO appends, OVERWRITE replaces
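A sketch of the multi-insert form from the syntax above, writing one scan of psn into two hypothetical target tables (psn_stat and psn_young are assumed to already exist with matching columns):
from psn
insert overwrite table psn_stat select count(*)
insert into table psn_young select id, name where age < 22;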
5、Partitions
Partitions must be declared when the table is defined.
create table psn3 (
id int,
name string
)
partitioned by (sex string,age int) -- declare the partition columns; they must not duplicate the table's regular columns
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n';
Data is first split into sex=value directories, then into age=value sub-directories.
With partitioned by (age int), age is declared as a partition column, so it must not be defined again inside the column list; its value is supplied manually later, i.e. it becomes the partition name.
A partition effectively adds an index to a class of data.
It assists queries by narrowing the scan range, speeding up retrieval and letting the data be managed by a given rule or condition.
Specifying a partition when loading data
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
partition_spec:
: (partition_column = partition_col_value, partition_column = partition_col_value, ...)
-- Example
alter table psn3 add partition(sex='nan',age=21) location '/user/root/data/';
-- with LOCATION, neither managed nor external tables move any data, and no partition directory is created
load data inpath '/user/root/data/hivedata2' overwrite into table psn3 partition(sex='nan',age=21);
-- with LOAD, both managed and external tables move the data to the LOCATION or warehouse directory, and a partition directory is created
from psn
insert into table psn_partition partition(sex='nan',age=21)
select id,name; -- the select list supplies only the non-partition columns
-- specify the partition when inserting from a query
Using a partition as a query condition
select * from psn4 where sex = 'nv';
This does not scan the whole table, so the query is fast.
alter table person_partition drop partition (sex='nv',age=21);
-- only deletes the age=21 folder under the sex=nv directory
alter table person_partition drop partition (age=21);
-- deletes the age=21 folders under both the sex=nv and sex=nan directories
For a managed (internal) table, whether the data was added via LOAD or via LOCATION, dropping a partition or the table [does] delete the data files and directories.
For an external table, whether the data was added via LOAD or via LOCATION, dropping a partition or the table does [not] delete the data files and directories.
Listing partitions
show partitions psn3;
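A partial partition spec can also be used as a filter (Hive 0.13 and later), for example:
show partitions psn3 partition(sex='nan');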
Manually deleting a partition whose name is garbled: remove the HDFS data first, then clean up the metadata by running the remaining statements against the MySQL metastore
-- manually delete a partition when the partition name is garbled
dfs -rm -r /user/hive_local/warehouse/psn_partition/sex=nan -- delete the data in HDFS (run from the Hive CLI)
SELECT * FROM TBLS WHERE TBL_NAME='psn_partition'; -- find the table id
select * from PARTITIONS t where t.tbl_id='61'; -- find the partition id
select * from PARTITION_KEY_VALS where part_id=16;
select * from PARTITION_PARAMS where part_id=16;
delete from PARTITION_KEY_VALS where part_id=31; -- delete the metadata rows
delete from PARTITION_PARAMS where part_id=31;
delete from PARTITIONS where tbl_id=61 and part_id=31;
6、Exporting a Hive table to files
Write query results to the file system (without LOCAL the output is written to HDFS).
insert overwrite local directory '/root/hive/person_result'
select * from psn;
-- by default, fields are separated by ^A and rows by \n; this can be changed
insert overwrite local directory '/root/hive/person_result'
row format delimited
fields terminated by '\t'
collection items terminated by '-'
map keys terminated by ':'
select * from psn;
-- export with a specified format
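Dropping the LOCAL keyword writes the result to an HDFS directory instead; a minimal sketch (the output path below is made up):
insert overwrite directory '/user/root/psn_result'
select * from psn;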
7、Hive SerDe
Hive SerDe - Serializer and Deserializer
SerDe performs serialization and deserialization.
It sits between the data storage and the execution engine and decouples the two.
Hive reads and writes row content through ROW FORMAT DELIMITED or a SERDE.
SerDe is a short name for “Serializer and Deserializer.”
Hive uses SerDe (and FileFormat) to read and write table rows.
HDFS files –> InputFileFormat –> <key, value> –> Deserializer –> Row object
Row object –> Serializer –> <key, value> –> OutputFileFormat –> HDFS files
Syntax
row_format
SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
Example: cleaning Tomcat access logs
Log format:
127.0.0.1 - - [14/Jul/2017:18:26:01 +0800] "GET /index.jsp HTTP/1.1" 200 11452
Requirement: strip the square brackets around the date and the double quotes around the request.
create table if not exists blog_access_logs (
host string,
identity string,
tomcat_user string,
time string,
request string,
status string,
length string
)
partitioned by (day string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' -- specify the serializer/deserializer class
with serdeproperties (
'input.regex'='([^ ]*?) ([^ ]*?) ([^ ]*?) \\[(.*?)\\] \\"(.*?)\\" (-|[0-9]*) (-|[0-9]*)' -- the SerDe property: the regex whose capture groups map to the table columns
)
stored as textfile;
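A hedged usage sketch: load one day's log file into the table and check a few parsed fields (the local file path and the partition value below are made up):
load data local inpath '/root/logs/access.2017-07-14.log'
into table blog_access_logs partition(day='2017-07-14');
select host, request, status from blog_access_logs where day='2017-07-14' limit 10;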
8、Hive JDBC
After hiveserver2 is started on the server, Java code can connect to and query Hive through the JDBC driver on the default port 10000.
Start HiveServer2
./bin/hiveserver2
Maven pom
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>site.imcoder</groupId>
<artifactId>review-hive</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<hadoopVersion>2.6.5</hadoopVersion>
<!-- compile -->
<hadoopScope>provided</hadoopScope>
<hiveVersion>1.2.1</hiveVersion>
<hiveScope>provided</hiveScope>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoopVersion}</version>
<scope>${hadoopScope}</scope>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hiveVersion}</version>
<scope>${hiveScope}</scope>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hiveVersion}</version>
<scope>${hiveScope}</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- set the JDK version used by the Maven compiler -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
JDBC code
package jdbc;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.sql.*;
/**
* @author Jeffrey.Deng
* @date 2017-09-19
*/
public class HiveQuery {
private final static String driver = "org.apache.hive.jdbc.HiveDriver";
private final static String url = "jdbc:hive2://node02:10000/default";
private final static String username = "root";
private String password = "";
private Connection conn = null;
private Statement stmt = null;
private ResultSet rs = null;
@Before
public void setup(){
try {
Class.forName(driver);
conn = DriverManager.getConnection(url, username, password);
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
}
@After
public void close() {
try {
if (rs != null) {
rs.close();
}
if (stmt != null) {
stmt.close();
}
if (conn != null) {
conn.close();
}
} catch (SQLException e) {
e.printStackTrace();
}
}
@Test
public void query() {
//String sql = "select call, count(call) as cc from psn group by call order by cc desc";
String sql = "select host, count(*) as rc from blog_access_logs group by host order by rc desc";
System.out.println("query sql: " + sql);
try {
stmt = conn.createStatement();
rs = stmt.executeQuery(sql);
int columnCount = rs.getMetaData().getColumnCount();
while(rs.next()) {
for (int i = 1; i < columnCount; i++) {
System.out.print(rs.getString(i) + "\t");
}
System.out.println(rs.getString(columnCount));
}
} catch (SQLException e) {
e.printStackTrace();
}
}
@Test
public void loadFunc() {
// register the functions
String addJarSql = "add jar /root/2017/udf.jar";
String loadFormatFuncSql = "create temporary function format_int as 'udf.FormatIntUDF'";
String loadCountFuncSql = "create temporary function mycount as 'udf.CountUDAF'";
try {
stmt = conn.createStatement();
System.out.println("execute sql: " + addJarSql);
stmt.execute(addJarSql);
System.out.println("execute sql: " + loadFormatFuncSql);
stmt.execute(loadFormatFuncSql);
System.out.println("execute sql: " + loadCountFuncSql);
stmt.execute(loadCountFuncSql);
} catch (SQLException e) {
e.printStackTrace();
}
}
@Test
public void useFunc() {
// register the functions
loadFunc();
// use the function
String sql = "select format_int(id, 4) as fi, name , call, tags[0], props['cat'] from psn";
System.out.println("query sql: " + sql);
try {
stmt = conn.createStatement();
rs = stmt.executeQuery(sql);
print(rs);
} catch (SQLException e) {
e.printStackTrace();
}
}
@Test
public void udafFunc() {
// register the functions
loadFunc();
// use the function
String sql = "select call, mycount(*) as cn from psn group by call order by cn desc";
//String sql = "select tag, mycount(tag) as cn from psn lateral view explode(tags) lve_psn as tag group by tag order by cn desc";
System.out.println("query sql: " + sql);
try {
stmt = conn.createStatement();
rs = stmt.executeQuery(sql);
print(rs);
} catch (SQLException e) {
e.printStackTrace();
}
}
@Test
public void lateralView() {
// lateral view combined with a UDTF such as explode
String sql = "select tag, count(tag) as tn from psn lateral view explode(tags) lve_psn as tag group by tag order by tn desc";
try {
stmt = conn.createStatement();
rs = stmt.executeQuery(sql);
print(rs);
} catch (SQLException e) {
e.printStackTrace();
}
}
public void print(ResultSet rs) {
try {
int columnCount = rs.getMetaData().getColumnCount();
while(rs.next()) {
for (int i = 1; i < columnCount; i++) {
System.out.print(rs.getString(i) + "\t");
}
System.out.println(rs.getString(columnCount));
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
Hive Functions
一、Built-in functions
Function documentation download: Hive-LanguageManualUDF.pdf
Official page: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
二、User-defined functions
There are three kinds of user-defined functions: UDF, UDAF and UDTF.
UDF (User-Defined Function): one in, one out, i.e. one row in, one row out.
UDAF (User-Defined Aggregation Function): aggregate function, many in, one out (many rows in, one row out), e.g. count/max/min.
UDTF (User-Defined Table-Generating Function): one in, many out, e.g. explode().
Usage: in a Hive session, ADD the jar containing the custom function, CREATE a FUNCTION for it, and then call the function, as sketched below.
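A minimal HiveQL sketch of that workflow, reusing the jar path and class name from the JDBC example above (they are specific to the author's environment, so substitute your own):
add jar /root/2017/udf.jar;
create temporary function format_int as 'udf.FormatIntUDF';
select format_int(id, 4) from psn;
drop temporary function format_int; -- temporary functions disappear at the end of the session anyway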