greenplum 外部表
第一次听到外部表,后来查了一下,oracle也有外部表(孤陋寡闻)
greenplum 外部表:一张表的数据指向数据库之外的数据文件
外部表需要指定gpfdist的ip和端口,还有详细的目录地址,其中文件名支持通配符。可以编写多个gpfdist的地址,但是不能超过总的segment数
tutorial=> \h create external
Command: CREATE EXTERNAL TABLE
Description: define a new external table
Syntax:
CREATE [READABLE] EXTERNAL TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
LOCATION ('file://seghost[:port]/path/file' [, ...])
| ('gpfdist://filehost[:port]/file_pattern[#transform]'
| ('gpfdists://filehost[:port]/file_pattern[#transform]'
[, ...])
| ('gphdfs://hdfs_host[:port]/path/file')
FORMAT 'TEXT'
[( [HEADER]
[DELIMITER [AS] 'delimiter' | 'OFF']
[NULL [AS] 'null string']
[ESCAPE [AS] 'escape' | 'OFF']
[NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
[FILL MISSING FIELDS] )]
| 'CSV'
[( [HEADER]
[QUOTE [AS] 'quote']
[DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[FORCE NOT NULL column [, ...]]
[ESCAPE [AS] 'escape']
[NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
[FILL MISSING FIELDS] )]
| 'CUSTOM' (Formatter=<formatter specifications>)
[ ENCODING 'encoding' ]
[ [LOG ERRORS INTO error_table] SEGMENT REJECT LIMIT count
[ROWS | PERCENT] ]
CREATE [READABLE] EXTERNAL WEB TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
LOCATION ('http://webhost[:port]/path/file' [, ...])
| EXECUTE 'command' [ON ALL
| MASTER
| number_of_segments
| HOST ['segment_hostname']
| SEGMENT segment_id ]
FORMAT 'TEXT'
[( [HEADER]
[DELIMITER [AS] 'delimiter' | 'OFF']
[NULL [AS] 'null string']
[ESCAPE [AS] 'escape' | 'OFF']
[NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
[FILL MISSING FIELDS] )]
| 'CSV'
[( [HEADER]
[QUOTE [AS] 'quote']
[DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[FORCE NOT NULL column [, ...]]
[ESCAPE [AS] 'escape']
[NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
[FILL MISSING FIELDS] )]
| 'CUSTOM' (Formatter=<formatter specifications>)
[ ENCODING 'encoding' ]
[ [LOG ERRORS INTO error_table] SEGMENT REJECT LIMIT count
[ROWS | PERCENT] ]
CREATE WRITABLE EXTERNAL TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
LOCATION('gpfdist://outputhost[:port]/filename[#transform]'
| ('gpfdists://outputhost[:port]/file_pattern[#transform]'
[, ...])
| ('gphdfs://hdfs_host[:port]/path')
FORMAT 'TEXT'
[( [DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[ESCAPE [AS] 'escape' | 'OFF'] )]
| 'CSV'
[([QUOTE [AS] 'quote']
[DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[FORCE QUOTE column [, ...]] ]
[ESCAPE [AS] 'escape'] )]
| 'CUSTOM' (Formatter=<formatter specifications>)
[ ENCODING 'write_encoding' ]
[ DISTRIBUTED BY (column, [ ... ] ) | DISTRIBUTED RANDOMLY ]
CREATE WRITABLE EXTERNAL WEB TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
EXECUTE 'command' [ON ALL]
FORMAT 'TEXT'
[( [DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[ESCAPE [AS] 'escape' | 'OFF'] )]
| 'CSV'
[([QUOTE [AS] 'quote']
[DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[FORCE QUOTE column [, ...]] ]
[ESCAPE [AS] 'escape'] )]
| 'CUSTOM' (Formatter=<formatter specifications>)
[ ENCODING 'write_encoding' ]
[ DISTRIBUTED BY (column, [ ... ] ) | DISTRIBUTED RANDOMLY ]
tutorial=>
创建外部表流程
1.文件服务器中启动gpfdist服务
[gpadmin@master admin]$ nohup /usr/local/greenplum-db/bin/gpfdist -d /home/admin/ -p 8888> /tmp/gpfdist.log &
[1] 9928
[gpadmin@master admin]$ cat /tmp/gpfdist.log
Serving HTTP on port 8888, directory /home/admin/
文件服务器路径已经要准确,否则,之前创建的外部表就会出错
2.创建语句
CREATE EXTERNAL TABLE public.test001_ext
(
id int,
name varchar(100)
)
LOCATION (
'gpfdist://192.168.1.80:8888/gpextdata/d.txt'
)
FORMAT 'text' (delimiter ',' null '' escape 'OFF')
ENCODING 'GB18030';
tutorial=> CREATE EXTERNAL TABLE public.test001_ext
( id int, name varchar(100) )
LOCATION ( 'gpfdist://192.168.1.80:8888/gpextdata/d.txt' )
FORMAT 'text' (delimiter ',' null '' escape 'OFF')
ENCODING 'GB18030';
CREATE EXTERNAL TABLE
tutorial=> select * from test001_ext;
ERROR: http response code 404 from gpfdist (gpfdist://192.168.1.80:8888/gpextdata/d.txt): HTTP/1.0404 file not found (url.c:326) (seg0 slice1 slave1:40000 pid=1847) (cdbdisp.c:1477)
文件路径有问题
tutorial=> CREATE EXTERNAL TABLE public.test001_ext
( id int, name varchar(100) ) LOCATION ( 'gpfdist://192.168.1.80:8888/gpextdata/d.txt' ) FORMAT 'text' (delimiter ',' null '' escape 'OFF') ENCODING 'GB18030';CREATE EXTERNAL TABLE
tutorial=> select * from test001_ext;
id | name
----+------
1 | 2
3 | 4
(2 rows)
CREATE EXTERNAL TABLE public.test001_ext
(
id int,
name varchar(100)
)
LOCATION (
'gpfdist://192.168.1.80:8888/gpextdata/d.txt'
)
FORMAT 'text' (delimiter ',' null '' escape 'OFF')
ENCODING 'GB18030';
tutorial=> select * from test001_ext;
id | name
----+------
1 | 2
3 | 4
(2 rows)
可以编写多个gpfdist的地址,但是不能超过总的segment数
tutorial=> CREATE EXTERNAL TABLE public.test002_ext
tutorial-> ( id int ,name varchar(100))
tutorial-> LOCATION (
tutorial(> 'gpfdist://192.168.1.80:8888/gpextdata/d.txt',
tutorial(> 'gpfdist://192.168.1.80:8888/gpextdata/b.txt',
tutorial(> 'gpfdist://192.168.1.80:8888/gpextdata/c.txt')
tutorial-> FORMAT 'text' (delimiter ',' null '' escape 'OFF')
tutorial-> ENCODING 'GB18030';
CREATE EXTERNAL TABLE
tutorial=> select * from test002_ext;
ERROR: There are more external files (URLs) than primary segments that can read them. Found 3 URLs
and 2 primary segments