0: System login setup
Run this on the master so its own public key is accepted for SSH logins:

cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
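The command above assumes an RSA key pair already exists. A minimal sketch for creating one and pushing it to the slaves (host names as configured in /etc/hosts below; ssh-copy-id needs password login to succeed once):

ssh-keygen -t rsa -P '' -f $HOME/.ssh/id_rsa   # generate a passwordless key if none exists yet
for host in spark2 spark3; do
    ssh-copy-id root@$host                     # append the public key to the slave's authorized_keys
done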
If you log in as root, allow root logins over SSH:

sed -ri 's/^(PermitRootLogin ).*$/\1yes/' /etc/ssh/sshd_config
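sshd only picks up this change after a reload; a quick verification and restart (the service name may be ssh or sshd depending on the distribution):

grep '^PermitRootLogin' /etc/ssh/sshd_config   # should now print: PermitRootLogin yes
service ssh restart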
Edit /etc/hosts:

127.0.0.1 localhost
# do not put spark1 on this line
192.168.100.25 spark1
# spark1 is the master
192.168.100.26 spark2
192.168.100.27 spark3
127.0.1.1 ubuntu
# The following lines are desirable for IPv6 capable hosts
::1 localhost ip6-localhost ip6-loopback
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
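To confirm the names resolve to the cluster addresses rather than a loopback entry, something like this can be run on each node:

getent hosts spark1 spark2 spark3
# expected: 192.168.100.25 spark1, 192.168.100.26 spark2, 192.168.100.27 spark3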
If spark1 is placed on the first line of /etc/hosts (i.e. mapped to 127.0.0.1), the slaves will report the following error:

org.apache.hadoop.ipc.Client: Retrying connect to server: spark1/192.168.100.25:9000. Already tried 0 time(s)
Running this on spark1:

ss -lnt
LISTEN 0 128 localhost:9000

shows that port 9000 is listening on the loopback address only. Remove the offending entry from /etc/hosts and restart Hadoop to fix the problem.
1: Install Java
You can install it directly with apt-get:

apt-get install python-software-properties -y
add-apt-repository ppa:webupd8team/java
apt-get update
apt-get install oracle-java7-installer
Or download and install it manually:

wget http://download.oracle.com/otn-pub/java/jdk/7u80-b15/jdk-7u80-linux-x64.tar.gz
mkdir /usr/lib/jvm
tar xvf jdk-7u80-linux-x64.tar.gz
mv jdk1.7.0_80 /usr/lib/jvm
# register the JDK with update-alternatives
update-alternatives --install "/usr/bin/java" "java" "/usr/lib/jvm/jdk1.7.0_80/bin/java" 1
update-alternatives --install "/usr/bin/javac" "javac" "/usr/lib/jvm/jdk1.7.0_80/bin/javac" 1
update-alternatives --install "/usr/bin/javaws" "javaws" "/usr/lib/jvm/jdk1.7.0_80/bin/javaws" 1
update-alternatives --config java
# verify
java -version
javac -version
javaws -version
Add the environment variables (the EOF delimiter is quoted so the variables are written to /etc/profile literally instead of being expanded now):

cat >> /etc/profile <<'EOF'
export JAVA_HOME=/usr/lib/jvm/jdk1.7.0_80
export JRE_HOME=/usr/lib/jvm/jdk1.7.0_80/jre
export CLASSPATH=.:$CLASSPATH:$JAVA_HOME/lib:$JRE_HOME/lib
export PATH=$PATH:$JAVA_HOME/bin:$JRE_HOME/bin
EOF
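To pick up the new variables in the current shell and confirm the JDK registered above is the one in use:

source /etc/profile
echo $JAVA_HOME    # /usr/lib/jvm/jdk1.7.0_80
java -version      # should report 1.7.0_80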
2: Install Hadoop

tar xvf hadoop-2.7.3.tar.gz
mv hadoop-2.7.3 /usr/local/hadoop
cd /usr/local/hadoop
mkdir -p hdfs/{data,name,tmp}
Add the environment variables:

cat >> /etc/profile <<'EOF'
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$HADOOP_HOME/bin
EOF
Edit hadoop-env.sh:

export JAVA_HOME=/usr/lib/jvm/jdk1.7.0_80   # the only line that changes
Edit core-site.xml:

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://spark1:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/usr/local/hadoop/hdfs/tmp</value>
    </property>
</configuration>
Edit hdfs-site.xml:

<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/usr/local/hadoop/hdfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/usr/local/hadoop/hdfs/data</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
</configuration>
Edit mapred-site.xml:

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
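Note that a stock 2.7.3 tarball ships only mapred-site.xml.template; the file edited above is normally created from that template first:

cd /usr/local/hadoop/etc/hadoop
cp mapred-site.xml.template mapred-site.xml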
Edit yarn-site.xml:

<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>spark1</value>
    </property>
    <!-- Do not add yarn.nodemanager.hostname; setting it can cause:
         Problem binding to [spark1:0] java.net.BindException: Cannot assign requested address
    <property>
        <name>yarn.nodemanager.hostname</name>
        <value>spark1</value>
    </property>
    -->
</configuration>
The full set of properties and values for the files above is documented on the official site:
https://hadoop.apache.org/docs/r2.7.3/
Edit the masters file:

echo spark1 > masters
Edit the slaves file:

spark1
spark2
spark3
Once everything is installed, use rsync to copy the relevant directories and /etc/profile to the other nodes, as sketched below.
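A minimal sketch of that sync step, assuming root SSH access between the nodes and the directory layout used above (adjust the path list to whatever has actually been installed at this point):

for host in spark2 spark3; do
    ssh $host mkdir -p /usr/lib/jvm                        # make sure the JVM parent directory exists
    rsync -az /usr/lib/jvm/jdk1.7.0_80 $host:/usr/lib/jvm/
    rsync -az /usr/local/hadoop $host:/usr/local/
    rsync -az /etc/profile /etc/hosts $host:/etc/
done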
Format the HDFS file system (only needed once, and it must be done before HDFS is started for the first time):

hadoop namenode -format

Start HDFS:

./sbin/start-dfs.sh

Start YARN:

./sbin/start-yarn.sh
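Once both start scripts have run, a quick way to confirm that all three DataNodes registered with the NameNode (assuming the PATH set up earlier):

hdfs dfsadmin -report | grep -E 'Live datanodes|Name:'
# expect a 'Live datanodes (3):' line plus one 'Name:' line per node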
Check the processes on spark1:

root@spark1:/usr/local/spark/conf# jps
1699 NameNode
8856 Jps
2023 SecondaryNameNode
2344 NodeManager
1828 DataNode
2212 ResourceManager
spark2 and spark3 should show processes similar to the following:

root@spark2:/tmp# jps
3238 Jps
1507 DataNode
1645 NodeManager
You can also check the web UI:

http://192.168.100.25:50070
Test Hadoop:

hadoop fs -mkdir /testin
hadoop fs -put ~/str.txt /testin
cd /usr/local/hadoop
hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar wordcount /testin/str.txt testout
The output looks like this:
hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar wordcount /testin/str.txt testout
17/02/24 11:20:59 INFO client.RMProxy: Connecting to ResourceManager at spark1/192.168.100.25:8032
17/02/24 11:21:01 INFO input.FileInputFormat: Total input paths to process : 1
17/02/24 11:21:01 INFO mapreduce.JobSubmitter: number of splits:1
17/02/24 11:21:02 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1487839487040_0002
17/02/24 11:21:06 INFO impl.YarnClientImpl: Submitted application application_1487839487040_0002
17/02/24 11:21:06 INFO mapreduce.Job: The url to track the job: http://spark1:8088/proxy/application_1487839487040_0002/
17/02/24 11:21:06 INFO mapreduce.Job: Running job: job_1487839487040_0002
17/02/24 11:21:28 INFO mapreduce.Job: Job job_1487839487040_0002 running in uber mode : false
17/02/24 11:21:28 INFO mapreduce.Job: map 0% reduce 0%
17/02/24 11:22:00 INFO mapreduce.Job: map 100% reduce 0%
17/02/24 11:22:15 INFO mapreduce.Job: map 100% reduce 100%
17/02/24 11:22:17 INFO mapreduce.Job: Job job_1487839487040_0002 completed successfully
17/02/24 11:22:17 INFO mapreduce.Job: Counters: 49
    File System Counters
        FILE: Number of bytes read=212115
        FILE: Number of bytes written=661449
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=377966
        HDFS: Number of bytes written=154893
        HDFS: Number of read operations=6
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters
        Launched map tasks=1
        Launched reduce tasks=1
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=23275
        Total time spent by all reduces in occupied slots (ms)=11670
        Total time spent by all map tasks (ms)=23275
        Total time spent by all reduce tasks (ms)=11670
        Total vcore-milliseconds taken by all map tasks=23275
        Total vcore-milliseconds taken by all reduce tasks=11670
        Total megabyte-milliseconds taken by all map tasks=23833600
        Total megabyte-milliseconds taken by all reduce tasks=11950080
    Map-Reduce Framework
        Map input records=1635
        Map output records=63958
        Map output bytes=633105
        Map output materialized bytes=212115
        Input split bytes=98
        Combine input records=63958
        Combine output records=14478
        Reduce input groups=14478
        Reduce shuffle bytes=212115
        Reduce input records=14478
        Reduce output records=14478
        Spilled Records=28956
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=429
        CPU time spent (ms)=10770
        Physical memory (bytes) snapshot=455565312
        Virtual memory (bytes) snapshot=1391718400
        Total committed heap usage (bytes)=277348352
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=377868
    File Output Format Counters
        Bytes Written=154893
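The word counts themselves land in the testout directory (a relative path, so it ends up under the submitting user's HDFS home); the reducer output follows the usual part-r-NNNNN naming:

hadoop fs -ls testout
hadoop fs -cat testout/part-r-00000 | head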
3: Install Scala

tar xvf scala-2.11.8.tgz
mv scala-2.11.8 /usr/local/scala
Add the environment variables:

cat >> /etc/profile <<'EOF'
export SCALA_HOME=/usr/local/scala
export PATH=$PATH:$SCALA_HOME/bin
EOF
Test:

source /etc/profile
scala -version
Scala code runner version 2.11.8 -- Copyright 2002-2016, LAMP/EPFL
4: Install Spark

tar xvf spark-2.1.0-bin-hadoop2.7.tgz
mv spark-2.1.0-bin-hadoop2.7 /usr/local/spark
Add the environment variables:

cat >> /etc/profile <<'EOF'
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
EOF
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
# without this line, running spark-shell produces the following warning:
NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Edit spark-env.sh:

SPARK_MASTER_HOST=spark1
HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
Edit slaves:

spark1
spark2
spark3
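Neither spark-env.sh nor slaves exists in a fresh Spark download; both are normally created from the templates bundled in conf/ before editing:

cd /usr/local/spark/conf
cp spark-env.sh.template spark-env.sh
cp slaves.template slaves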
Start Spark (this is Spark's own start-all.sh, run from /usr/local/spark):

./sbin/start-all.sh
Running jps on spark1 now shows two extra processes, Master and Worker:

root@spark1:/usr/local/spark/conf# jps
1699 NameNode
8856 Jps
7774 Master
2023 SecondaryNameNode
7871 Worker
2344 NodeManager
1828 DataNode
2212 ResourceManager
spark2 and spark3 each gain a Worker:

root@spark2:/tmp# jps
3238 Jps
1507 DataNode
1645 NodeManager
3123 Worker
You can also check the web UI:

http://192.168.100.25:8080/
Run spark-shell:

root@spark1:/usr/local/spark/conf# spark-shell
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
17/02/24 11:55:46 WARN SparkContext: Support for Java 7 is deprecated as of Spark 2.0.0
17/02/24 11:56:17 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
Spark context Web UI available at http://192.168.100.25:4040
Spark context available as 'sc' (master = local[*], app id = local-1487908553475).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.1.0
      /_/

Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_80)
Type in expressions to have them evaluated.
Type :help for more information.

scala> :help
The environment can then be inspected in the Spark UI:

http://192.168.100.25:4040/environment/
Test Spark:

run-example org.apache.spark.examples.SparkPi
17/02/28 11:17:20 INFO DAGScheduler: Job 0 finished: reduce at SparkPi.scala:38, took 3.491241 s
Pi is roughly 3.1373756868784346
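run-example runs with a local master unless spark.master is configured elsewhere. To exercise the standalone cluster started above, the same example can be submitted to the master directly (the examples jar path below is the one shipped with Spark 2.1.0; adjust it if yours differs):

spark-submit --master spark://spark1:7077 \
    --class org.apache.spark.examples.SparkPi \
    /usr/local/spark/examples/jars/spark-examples_2.11-2.1.0.jar 100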
That completes the setup.
This article was originally posted by nonono11 on the 51CTO blog: http://blog.51cto.com/abian/1900868. Please contact the original author before republishing.