九、使用PigLatin语句分析数据:
创建员工表:load后加载数据,using后指定分隔符,as后指定包结构
emp = load '/scott/emp.csv' using PigStorage(',') as(empno:int,ename:chararray,job:chararray,mgr:int,hiredate:chararray,sal:int,comm:int,deptno:int); describe emp;
查看员工表:2
SQL:select * from emp; PL:emp0 = foreach emp generate *; dump emp0;
创建部门表:
dept = load '/scott/dept.csv' using PigStorage(',') as(deptno:int,dname:chararray,loc:chararray);
查看部门表:
SQL:select * from dept; PL:dept0 = foreach dept generate *; dump dept0;
查询员工号、员工名和薪水:
SQL:select empno,ename,sal from emp; PL:emp1 = foreach emp generate empno,ename,sal; dump emp1;
根据薪水对员工表排序:
SQL:select sal from emp order by sal; PL:emp2 = order emp by sal; dump emp2;
按照部门号对员工表分组并求每个部门中薪水的最大值:
SQL:select deptno,max(sal) from emp group by deptno; PL:emp3 = group emp by deptno; emp4 = foreach emp3 generate group,MAX(emp.sal); dump emp4;
查看10、20、30号部门的员工
SQL:select * from emp where deptno=10; select * from emp where deptno=20; select * from emp where deptno=30; PL:emp5 = filter emp by deptno==10; dump emp5; emp6 = filter emp by deptno==20; dump emp6; emp7 = filter emp by deptno==30; dump emp7;
多表查询,查询员工名和部门名:
SQL:select emp.ename,dept.dname from emp,dept where emp.deptno=dept.deptno; PL:emp8 = join emp by deptno,dept by deptno emp9 = foreach emp8 generate emp::ename,dept::dname; dump emp9;
内连接:
C = join A by id,B by id;
外连接:
左外连接:C = join A by id left outer,B by id; #以左侧数据为基准,只返回左侧有的数据
右外连接:C = join A by id right outer,B by id;#以右侧数据为基准,只返回右侧有的数据
全外连接:C = join A by id full outer, B by id;#两侧数据都返回
联合查询,查询10号部门和20号部门的员工:
SQL:select * from emp where deptno=10 union select * from dept where deptno=20; PL: emp10 = filter emp by deptno==10; emp11 = filter emp by deptno==20; emp12 = union emp10,emp11; 实现wordcount;
加载数据
mydata = load '/output/data2.txt' as (line:chararray);
将字符串分割成单词
words = foreach mydata generate flatten(TOKENIZE(line)) as word;
对单词分组
grpd = group words by word;
统计每组中单词数量
cntd = foreach grpd generate group,COUNT(words);
结果显示到屏幕上
dump cntd;
结果存储到HDFS上
store cntd into '/pig';