Hive SQL 的语法与使用

官方文档
1、数据库
    默认数据库为default

-- 创建数据库
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
  [COMMENT database_comment]
  [LOCATION hdfs_path]
  [WITH DBPROPERTIES (property_name=property_value, ...)];
  
-- 删除数据库
DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];

-- 修改数据库
ALTER (DATABASE|SCHEMA) database_name SET DBPROPERTIES (property_name=property_value, ...); 
 
ALTER (DATABASE|SCHEMA) database_name SET OWNER [USER|ROLE] user_or_role; 

-- 切换数据库
USE database_name;
USE DEFAULT;
2、创建表
-- 方式一
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name    -- (Note: TEMPORARY available in Hive 0.14.0 and later)
  [(col_name data_type [COMMENT col_comment], ... [constraint_specification])]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [SKEWED BY (col_name, col_name, ...)                  -- (Note: Available in Hive 0.10.0 and later)]
     ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
     [STORED AS DIRECTORIES]
  [
   [ROW FORMAT row_format] 
   [STORED AS file_format]
     | STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)]  -- (Note: Available in Hive 0.6.0 and later)
  ]
  [LOCATION hdfs_path] -- 从hdfs目录添加数据
  [TBLPROPERTIES (property_name=property_value, ...)]   -- (Note: Available in Hive 0.6.0 and later)
  [AS select_statement];   -- (数据为子查询结果 not supported for external tables)
 
-- 方式二 : 复制另一个表的结构
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE existing_table_or_view_name
  [LOCATION hdfs_path];

-- 数据类型
data_type
  : primitive_type
  | array_type
  | map_type
  | struct_type
  | union_type  -- (Note: Available in Hive 0.7.0 and later)
 
-- 基本类型
primitive_type
  : TINYINT
  | SMALLINT
  | INT
  | BIGINT
  | BOOLEAN
  | FLOAT
  | DOUBLE
  | DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
  | STRING
  | BINARY      -- (Note: Available in Hive 0.8.0 and later)
  | TIMESTAMP   -- (Note: Available in Hive 0.8.0 and later)
  | DECIMAL     -- (Note: Available in Hive 0.11.0 and later)
  | DECIMAL(precision, scale)  -- (Note: Available in Hive 0.13.0 and later)
  | DATE        -- (Note: Available in Hive 0.12.0 and later)
  | VARCHAR     -- (Note: Available in Hive 0.12.0 and later)
  | CHAR        -- (Note: Available in Hive 0.13.0 and later)

-- 数组
array_type
  : ARRAY < data_type >

-- 键值对
map_type
  : MAP < primitive_type, data_type >

-- 自定义
struct_type
  : STRUCT < col_name : data_type [COMMENT col_comment], ...>
 
union_type
   : UNIONTYPE < data_type, data_type, ... >  -- (Note: Available in Hive 0.7.0 and later)

-- 行格式解析定义   
row_format
  : DELIMITED 
		[FIELDS TERMINATED BY char [ESCAPED BY char]] -- 字段分割符
		[COLLECTION ITEMS TERMINATED BY char] -- 集合元素分隔符
        [MAP KEYS TERMINATED BY char] -- map key与value分隔符
		[LINES TERMINATED BY char] -- 行分隔符
        [NULL DEFINED AS char]   -- 空值定义 (Note: Available in Hive 0.13 and later)
	-- 自己编程实现 指定类名
  | SERDE serde_name
		-- 属性值
		[WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
 
-- 文件格式定义
file_format:
  : SEQUENCEFILE
  | TEXTFILE    -- (Default, depending on hive.default.fileformat configuration)
  | RCFILE      -- (Note: Available in Hive 0.6.0 and later)
  | ORC         -- (Note: Available in Hive 0.11.0 and later)
  | PARQUET     -- (Note: Available in Hive 0.13.0 and later)
  | AVRO        -- (Note: Available in Hive 0.14.0 and later)
  | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname

-- 约束定义
constraint_specification:
  : [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE ]
  [, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE ]
例子:
数据
1,devin,22,game-study-beauty,beijing:haidian-changsha:nueluqu
2,liuxinrong,21,movie-study-man,beijing:haidian-changsha:ningxiang
创建一个内部表   --> 使此创表语句能直接复制到hive命令行接口,每行前面不能有制表符即不能按tab,不然会乱
-- Managed (internal) table: Hive owns both the metadata and the data files.
CREATE TABLE psn (
  id      INT,
  name    STRING,
  age     INT,
  likes   ARRAY<STRING>,
  address MAP<STRING,STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  COLLECTION ITEMS TERMINATED BY '-'
  MAP KEYS TERMINATED BY ':'
  LINES TERMINATED BY '\n';

创建 一张外部表

-- External table: Hive manages only the metadata; dropping the table leaves
-- the files under LOCATION untouched.
CREATE EXTERNAL TABLE psn2 (
  id      INT,
  name    STRING,
  age     INT,
  likes   ARRAY<STRING>,
  address MAP<STRING,STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  COLLECTION ITEMS TERMINATED BY '-'
  MAP KEYS TERMINATED BY ':'
  LINES TERMINATED BY '\n'
LOCATION '/data/hive/psn2/'; -- LOCATION must be an HDFS directory, not a file

复制表结构创建表

CREATE TABLE psn_copy LIKE psn; -- copies only the schema, no data

从查询结果创建表

-- CTAS: no column list may be given; the source cannot be an external table.
CREATE TABLE psn_select
AS
SELECT id, name, age FROM psn;

-- CTAS while also choosing the new table's storage delimiters.
CREATE TABLE IF NOT EXISTS psn_select_format
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ' '
  COLLECTION ITEMS TERMINATED BY '-'
  MAP KEYS TERMINATED BY ':'
  LINES TERMINATED BY '\n'
AS
SELECT id, name, age FROM psn;
select address['city'] from person where name='zs';
    显示表结构
describe/desc  [extended|formatted] table_name
-- formatted 查看更详细 
--			 显示location 和 表类型,可以看是内部表还是外部表
3、删除表
DROP TABLE [IF EXISTS] table_name [PURGE];
-- 指定PURGE数据不会移动到.Trash/Current 会被完全删除
-- 如果表不存在会返回错误,需加if exists
    删除内部表时,元数据与数据都会被删除
    删除外部表只删除metastore的元数据,不删除hdfs中的表数据
4、导入数据
-- 语法
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
-- 加载hdfs数据
load data inpath '/root/hivedata' overwrite into table psn;
-- 加载本地数据
load data local inpath '/root/hivedata' overwrite into table psn;
-- filepath不能包括子目录
    load data 方式添加【hdfs】数据,不会触发MR任务
外部表数据文件都会被移动【注意是移动,不是复制】至建表时location指定目录下
    建表时未指定location则和内部表一样移动到hive工作空间下
   从查询结果导入数据,触发MR
-- 语法
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;
-- 例子 insert后不能指定col
from psn
insert into table rowcount select count(*);  -- into 追加 overwrite 覆盖
5、分区partition
    必须在表定义时创建partition 

-- Partitioned table: partition columns (sex, age) are virtual columns and
-- must not repeat any column of the table's own column list.
CREATE TABLE psn3 (
  id   INT,
  name STRING
)
PARTITIONED BY (sex STRING, age INT)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  COLLECTION ITEMS TERMINATED BY '-'
  MAP KEYS TERMINATED BY ':'
  LINES TERMINATED BY '\n';
先以sex=value为文件夹,再以age=value子文件夹区分
    partitioned by (age int) 分区里定义了age,小括号内的属性就不要定义age了,同时age的值都是后面手动指定的,即分区名
    partition分区相当于给一类数据加了索引
    就是辅助查询,缩小查询范围,加快数据的检索速度和对数据按照一定的规格和条件进行管理
    导入数据时指定分区
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
 partition_spec:
  : (partition_column = partition_col_value, partition_column = partition_col_value, ...)

-- 例子

-- ADD PARTITION with LOCATION only registers metadata: no files are moved and
-- no partition directory is created (same for managed and external tables).
ALTER TABLE psn3 ADD PARTITION (sex='nan', age=21) LOCATION '/user/root/data/';

-- LOAD DATA moves the file into the partition directory (created if missing)
-- under the table's LOCATION or the warehouse directory.
LOAD DATA INPATH '/user/root/data/hivedata2' OVERWRITE INTO TABLE psn3 PARTITION (sex='nan', age=21);

-- Multi-insert form: the source table appears once in the leading FROM and the
-- SELECT must not repeat it (compare the rowcount example above). With static
-- partition values (sex='nan', age=21), the SELECT supplies only the
-- non-partition columns — age is a partition column here, so it must not be
-- selected. NOTE(review): confirm psn_partition's data columns are (id, name).
FROM psn
INSERT INTO TABLE psn_partition PARTITION (sex='nan', age=21)
SELECT id, name;
-- 从查询导入时数据指定分区
    分区作为查询条件
select * from psn4 where sex = 'nv';

    不会扫描全表, 速度快

    删除分区数据
-- A full partition spec drops a single partition:
ALTER TABLE person_partition DROP PARTITION (sex='nv', age=21);
-- only the age=21 directory under sex=nv is removed
ALTER TABLE person_partition DROP PARTITION (age=21);
-- a partial spec matches every sex=* directory: all their age=21 dirs go
    如果是内部表,无论采用load加载还是location添加数据,删除分区或删除表后,数据文件和目录都【会】被删除
    如果是外部表,无论采用load加载还是location添加数据,删除分区或删除表后,数据文件和目录都【不会】被删除
    查看分区
 show partitions psn3;
    手动删除乱码分区,在 MySQL 中执行
-- 手动删除分区,当分区名乱码时
dfs -rm -r /user/hive_local/warehouse/psn_partition/sex=nan -- 删除数据
SELECT * FROM TBLS WHERE TBL_NAME='psn_partition'; -- 查出表id
select * from PARTITIONS t where t.tbl_id='61';  -- 查出partition id
select * from PARTITION_KEY_VALS where part_id=16;
select * from PARTITION_PARAMS where  part_id=16;

delete from PARTITION_KEY_VALS where part_id=31; -- 删除
delete from PARTITION_PARAMS where  part_id=31;
delete from PARTITIONS where tbl_id=61 and part_id=31;
6、导出hive表至文件
    将查询结果写入文件系统( 不加local为导出到hdfs )
-- Export query results to the local filesystem (omit LOCAL to write to HDFS).
-- Defaults: ^A between fields, \n between rows.
INSERT OVERWRITE LOCAL DIRECTORY '/root/hive/person_result'
SELECT * FROM psn;

-- Export with explicit delimiters.
INSERT OVERWRITE LOCAL DIRECTORY '/root/hive/person_result'
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY '-'
  MAP KEYS TERMINATED BY ':'
SELECT * FROM psn;
-- 导出指定格式


7、Hive SerDe
Hive SerDe - Serializer and Deserializer
    SerDe 用于做序列化和反序列化。
    构建在数据存储和执行引擎之间,对两者实现解耦。
    Hive通过ROW FORMAT DELIMITED以及SERDE进行内容的读写。

SerDe is a short name for “Serializer and Deserializer.”
Hive uses SerDe (and !FileFormat) to read and write table rows.
HDFS files –> InputFileFormat –> <key, value> –> Deserializer –> Row object
Row object –> Serializer –> <key, value> –> OutputFileFormat –> HDFS files
语法
row_format
SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]

例子 :清洗tomcat日志

logs格式:

127.0.0.1 - - [14/Jul/2017:18:26:01 +0800] "GET /index.jsp HTTP/1.1" 200 11452

需求:去掉日期的中括号,去掉请求的双引号

-- RegexSerDe table for tomcat access logs: each capture group of input.regex
-- maps to one column, which strips the [] around the date and the "" around
-- the request line during deserialization.
CREATE TABLE IF NOT EXISTS blog_access_logs (
  host        STRING,
  identity    STRING,
  tomcat_user STRING,
  time        STRING,
  request     STRING,
  status      STRING,
  length      STRING
)
PARTITIONED BY (day STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  'input.regex'='([^ ]*?) ([^ ]*?) ([^ ]*?) \\[(.*?)\\] \\"(.*?)\\" (-|[0-9]*) (-|[0-9]*)'
)
STORED AS TEXTFILE;


7、Hive JDBC
服务端启动hiveserver2后,在java代码中通过调用hive的jdbc访问默认端口10000进行连接、访问
    启动HiverServer2 
 ./bin/hiveserver2 
    Maven pom
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>site.imcoder</groupId>
    <artifactId>review-hive</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Hadoop/Hive versions should match the target cluster; scope
             "provided" keeps those jars out of the packaged artifact
             (switch to "compile" to bundle them for local runs). -->
        <hadoopVersion>2.6.5</hadoopVersion>
        <!-- compile -->
        <hadoopScope>provided</hadoopScope>
        <hiveVersion>1.2.1</hiveVersion>
        <hiveScope>provided</hiveScope>
    </properties>

    <dependencies>

        <!-- test runner used by the HiveQuery JDBC examples -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
        </dependency>

        <!-- Hadoop client classes required by the Hive JDBC driver -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoopVersion}</version>
            <scope>${hadoopScope}</scope>
        </dependency>

        <!-- Hive JDBC driver (org.apache.hive.jdbc.HiveDriver) -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hiveVersion}</version>
            <scope>${hiveScope}</scope>
        </dependency>

        <!-- Hive execution engine; needed to compile custom UDF/UDAF/UDTF -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hiveVersion}</version>
            <scope>${hiveScope}</scope>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <!-- pin the JDK source/target level for maven-compiler-plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>
    jdbc代码
package jdbc;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.sql.*;

/**
 * Hive JDBC examples.
 * <p>
 * Requires a running HiveServer2 (default port 10000). Each test opens a
 * connection in {@link #setup()} and releases all JDBC resources in
 * {@link #close()}.
 *
 * @author Jeffrey.Deng
 * @date 2017-09-19
 */
public class HiveQuery {

    private final static String driver = "org.apache.hive.jdbc.HiveDriver";
    private final static String url = "jdbc:hive2://node02:10000/default";
    private final static String username = "root";
    private String password = "";

    private Connection conn = null;
    private Statement stmt = null;
    private ResultSet rs = null;

    /** Loads the Hive driver and opens the JDBC connection before every test. */
    @Before
    public void setup() {
        try {
            Class.forName(driver);
            conn = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes ResultSet, Statement and Connection (in that order) after every test. */
    @After
    public void close() {
        try {
            if (rs != null) {
                rs.close();
            }
            if (stmt != null) {
                stmt.close();
            }
            if (conn != null) {
                conn.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Runs a GROUP BY query and prints every row tab-separated. */
    @Test
    public void query() {
        //String sql = "select call, count(call) as cc from psn group by call order by cc desc";
        String sql = "select host, count(*) as rc from blog_access_logs group by host order by rc desc";
        System.out.println("query sql: " + sql);
        try {
            stmt = conn.createStatement();
            rs = stmt.executeQuery(sql);
            // reuse the shared printer instead of duplicating the column loop
            print(rs);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Adds the custom-function jar and registers the temporary UDF/UDAF. */
    @Test
    public void loadFunc() {
        // register the functions for this session
        String addJarSql = "add jar /root/2017/udf.jar";
        String loadFormatFuncSql = "create temporary function format_int as 'udf.FormatIntUDF'";
        String loadCountFuncSql = "create temporary function mycount as 'udf.CountUDAF'";
        try {
            stmt = conn.createStatement();
            System.out.println("execute sql: " + addJarSql);
            stmt.execute(addJarSql);
            System.out.println("execute sql: " + loadFormatFuncSql);
            stmt.execute(loadFormatFuncSql);
            System.out.println("execute sql: " + loadCountFuncSql);
            stmt.execute(loadCountFuncSql);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Queries with the temporary UDF registered by {@link #loadFunc()}. */
    @Test
    public void useFunc() {
        // register the functions
        loadFunc();
        // use the UDF in a query
        String sql = "select format_int(id, 4) as fi, name , call, tags[0], props['cat'] from psn";
        System.out.println("query sql: " + sql);
        try {
            stmt = conn.createStatement();
            rs = stmt.executeQuery(sql);
            print(rs);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Aggregates with the temporary UDAF registered by {@link #loadFunc()}. */
    @Test
    public void udafFunc() {
        // register the functions
        loadFunc();
        // use the UDAF in an aggregation
        String sql = "select call, mycount(*) as cn from psn group by call order by cn desc";
        //String sql = "select tag, mycount(tag) as cn from psn lateral view explode(tags) lve_psn as tag group by tag order by cn desc";
        System.out.println("query sql: " + sql);
        try {
            stmt = conn.createStatement();
            rs = stmt.executeQuery(sql);
            print(rs);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** LATERAL VIEW combined with the explode() UDTF. */
    @Test
    public void lateralView() {
        String sql = "select tag, count(tag) as tn from psn lateral view explode(tags) lve_psn as tag group by tag order by tn desc";
        try {
            stmt = conn.createStatement();
            rs = stmt.executeQuery(sql);
            print(rs);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Prints every row of the result set, columns separated by tabs. */
    public void print(ResultSet rs) {
        try {
            int columnCount = rs.getMetaData().getColumnCount();
            while (rs.next()) {
                for (int i = 1; i < columnCount; i++) {
                    System.out.print(rs.getString(i) + "\t");
                }
                System.out.println(rs.getString(columnCount));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

}


Hive函数
一、内置函数
 函数文档下载:Hive-LanguageManualUDF.pdf
 官方地址:https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
二、自定义函数
    自定义函数包括三种 UDF、UDAF、UDTF 
                UDF(User-Defined-Function) 一进一出 ,既一行进一行出
                UDAF(User-Defined Aggregation Function)聚集函数,多进一出(多行进一行出)。Count/max/min 
                UDTF(User-Defined Table-Generating Functions)一进多出,如 explode() 
    使用方式 :在 HIVE 会话中 add 自定义函数的 jar 文件,然后创建 function 继而使用函数 
   自定义函数方法查看另外一篇文章 Hive自定义函数:UDF、UDAF、UDTF

添加新评论