CREATE TABLE语句用于在现有数据库中定义一个表。
CREATE TABLE语句使用数据源定义一个新表。
CREATE TABLE [ IF NOT EXISTS ] table_identifier [ ( col_name1 col_type1 [ COMMENT col_comment1 ], ... ) ] USING data_source [ OPTIONS ( key1=val1, key2=val2, ... ) ] [ PARTITIONED BY ( col_name1, col_name2, ... ) ] [ CLUSTERED BY ( col_name3, col_name4, ... ) [ SORTED BY ( col_name [ ASC | DESC ], ... ) ] INTO num_buckets BUCKETS ] [ LOCATION path ] [ COMMENT table_comment ] [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ] [ AS select_statement ]
注意,USING子句和AS select子句之间的子句可以按任意顺序出现。例如,可以在TBLPROPERTIES后面写COMMENT表注释。
数据源表的作用类似于指向基础数据源的指针。例如,你可以使用JDBC数据源在Spark中创建一个表“foo”,该表指向MySQL中的一个表”bar“。当你读/写表“foo”时,你实际上读/写了表“bar”。
一般来说,CREATE TABLE是在创建一个“指针”,你需要确保它指向现有的东西。一个例外是文件源,如parquet、json。如果你没有指定LOCATION,Spark将为你创建一个默认的表位置。
对于CREATE TABLE AS select with LOCATION,如果给定位置作为非空目录存在,Spark将抛出分析异常。如果spark.sql.legacy.allowNonEmptyLocationInCTAS设置为true,spark将用输入查询的数据覆盖基础数据源,以确保创建的表包含与输入查询完全相同的数据。
--Use data source CREATE TABLE student (id INT, name STRING, age INT) USING CSV; --Use data from another table CREATE TABLE student_copy USING CSV AS select * FROM student; --Omit the USING clause, which uses the default data source (parquet by default) CREATE TABLE student (id INT, name STRING, age INT); --Use parquet data source with parquet storage options --The columns 'id' and 'name' enable the bloom filter during writing parquet file, --column 'age' does not enable CREATE TABLE student_parquet(id INT, name STRING, age INT) USING PARQUET OPTIONS ( 'parquet.bloom.filter.enabled'='true', 'parquet.bloom.filter.enabled#age'='false' ); --Specify table comment and properties CREATE TABLE student (id INT, name STRING, age INT) USING CSV COMMENT 'this is a comment' TBLPROPERTIES ('foo'='bar'); --Specify table comment and properties with different clauses order CREATE TABLE student (id INT, name STRING, age INT) USING CSV TBLPROPERTIES ('foo'='bar') COMMENT 'this is a comment'; --Create partitioned and bucketed table CREATE TABLE student (id INT, name STRING, age INT) USING CSV PARTITIONED BY (age) CLUSTERED BY (Id) INTO 4 buckets; --Create partitioned and bucketed table through CTAS CREATE TABLE student_partition_bucket USING parquet PARTITIONED BY (age) CLUSTERED BY (id) INTO 4 buckets AS select * FROM student; --Create bucketed table through CTAS and CTE CREATE TABLE student_bucket USING parquet CLUSTERED BY (id) INTO 4 buckets ( WITH tmpTable AS ( select * FROM student WHERE id > 100 ) select * FROM tmpTable );
CREATE TABLE语句使用Hive格式定义一个新表。
CREATE [ EXTERNAL ] TABLE [ IF NOT EXISTS ] table_identifier [ ( col_name1[:] col_type1 [ COMMENT col_comment1 ], ... ) ] [ COMMENT table_comment ] [ PARTITIONED BY ( col_name2[:] col_type2 [ COMMENT col_comment2 ], ... ) | ( col_name1, col_name2, ... ) ] [ CLUSTERED BY ( col_name1, col_name2, ...) [ SORTED BY ( col_name1 [ ASC | DESC ], col_name2 [ ASC | DESC ], ... ) ] INTO num_buckets BUCKETS ] [ ROW FORMAT row_format ] [ STORED AS file_format ] [ LOCATION path ] [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ] [ AS select_statement ]
请注意,列定义子句和AS select子句之间的子句可以按任何顺序出现。例如,可以在TBLPROPERTIES之后编写COMMENT table_comment。
--Use hive format CREATE TABLE student (id INT, name STRING, age INT) STORED AS ORC; --Use data from another table CREATE TABLE student_copy STORED AS ORC AS select * FROM student; --Specify table comment and properties CREATE TABLE student (id INT, name STRING, age INT) COMMENT 'this is a comment' STORED AS ORC TBLPROPERTIES ('foo'='bar'); --Specify table comment and properties with different clauses order CREATE TABLE student (id INT, name STRING, age INT) STORED AS ORC TBLPROPERTIES ('foo'='bar') COMMENT 'this is a comment'; --Create partitioned table CREATE TABLE student (id INT, name STRING) PARTITIONED BY (age INT) STORED AS ORC; --Create partitioned table with different clauses order CREATE TABLE student (id INT, name STRING) STORED AS ORC PARTITIONED BY (age INT); --Use Row Format and file format CREATE TABLE student (id INT, name STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE; --Use complex datatype CREATE EXTERNAL TABLE family( name STRING, friends ARRAY<STRING>, children MAP<STRING, INT>, address STRUCT<street: STRING, city: STRING> ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ESCAPED BY '\\' COLLECTION ITEMS TERMINATED BY '_' MAP KEYS TERMINATED BY ':' LINES TERMINATED BY '\n' NULL DEFINED AS 'foonull' STORED AS TEXTFILE LOCATION '/tmp/family/'; --Use predefined custom SerDe CREATE TABLE avroExample ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' TBLPROPERTIES ('avro.schema.literal'='{ "namespace": "org.apache.hive", "name": "first_schema", "type": "record", "fields": [ { "name":"string1", "type":"string" }, { "name":"string2", "type":"string" } ] }'); --Use personalized custom SerDe(we may need to `ADD JAR xxx.jar` first to ensure we can find the serde_class, --or you may run into `CLASSNOTFOUND` exception) ADD JAR /tmp/hive_serde_example.jar; CREATE EXTERNAL TABLE family (id INT, name STRING) ROW FORMAT SERDE 'com.ly.spark.serde.SerDeExample' STORED AS INPUTFORMAT 'com.ly.spark.example.serde.io.SerDeExampleInputFormat' OUTPUTFORMAT 'com.ly.spark.example.serde.io.SerDeExampleOutputFormat' LOCATION '/tmp/family/'; --Use `CLUSTERED BY` clause to create bucket table without `SORTED BY` CREATE TABLE clustered_by_test1 (ID INT, AGE STRING) CLUSTERED BY (ID) INTO 4 BUCKETS STORED AS ORC --Use `CLUSTERED BY` clause to create bucket table with `SORTED BY` CREATE TABLE clustered_by_test2 (ID INT, NAME STRING) PARTITIONED BY (YEAR STRING) CLUSTERED BY (ID, NAME) SORTED BY (ID ASC) INTO 3 BUCKETS STORED AS PARQUET
CREATE TABLE语句使用现有表或视图的定义/元数据定义一个新表。
CREATE TABLE [IF NOT EXISTS] table_identifier LIKE source_table_identifier USING data_source [ ROW FORMAT row_format ] [ STORED AS file_format ] [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ] [ LOCATION path ]
-- Create table using an existing table CREATE TABLE Student_Dupli like Student; -- Create table like using a data source CREATE TABLE Student_Dupli like Student USING CSV; -- Table is created as external table at the location specified CREATE TABLE Student_Dupli like Student location '/root1/home'; -- Create table like using a rowformat CREATE TABLE Student_Dupli like Student ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE TBLPROPERTIES ('owner'='xxxx');
TAG:create table