Hadoop Distributed File System (HDFS) installation on FreeBSD.

Install Hadoop on two nodes.

Node1:

–         Hostname: svr207.nganthao.com

–         IP address: 192.168.1.207

–         Role: NameNode, DataNode

Node2:

–         Hostname: svr217.nganthao.com

–         IP address: 192.168.1.217

–         Role: DataNode

1         Install and configure Hadoop on Node1

1.1       Install Hadoop2

# cd /usr/ports/devel/hadoop2

# make install clean

1.2       Configure passwordless SSH

# su nutch

nutch@svr207:~ % pwd

/home/nutch

1.2.1       Create authorized_keys for localhost.

% ssh-keygen -t rsa

% cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys

1.2.2       Copy authorized_keys to the other datanode.

% scp ~/.ssh/authorized_keys nutch@[IP of other datanode]:/home/nutch/.ssh/
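To confirm that key-based login works before continuing, test the connection from the nutch account; it should print the remote hostname without prompting for a password (the IP below is Node2's address from the overview, adjust as needed):

% ssh nutch@192.168.1.217 hostname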

1.3       Create core-site.xml

 # vi /usr/local/etc/hadoop/core-site.xml

<configuration>

<property>

<name>fs.defaultFS</name>

<value>hdfs://svr207.nganthao.com:9000</value>

</property>

<property>

<name>fs.trash.interval</name>

<value>1440</value>

</property>

<property>

<name>fs.trash.checkpoint.interval</name>

<value>1440</value>

</property>

<property>

<name>hadoop.tmp.dir</name>

<value>/home/nutch/tmp</value>

</property></configuration>
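fs.defaultFS is the NameNode endpoint that clients resolve relative HDFS paths against. Once HDFS is running (section 3.3), the same URI can be used explicitly to verify the setting, for example:

% hadoop fs -ls hdfs://svr207.nganthao.com:9000/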

1.4       Create hdfs-site.xml

# vi /usr/local/etc/hadoop/hdfs-site.xml

<configuration>

<property>

<name>dfs.replication</name>

<value>1</value>

</property>

<property>

<name>dfs.namenode.name.dir</name>

<value>file:///home/nutch/filesystem/name/</value>

</property>

<property>

<name>dfs.datanode.data.dir</name>

<value>file:///home/nutch/filesystem/data/</value>

</property>

</configuration>

1.5       Create hadoop-env.sh

# cp /usr/local/share/examples/hadoop/conf/hadoop-env.sh /usr/local/etc/hadoop

# vi /usr/local/etc/hadoop/hadoop-env.sh

export JAVA_HOME=/usr/local/openjdk7
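To confirm that this JAVA_HOME path is valid on the machine (it is where the openjdk7 port normally installs), run:

% /usr/local/openjdk7/bin/java -version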

1.6       Create slaves file.

# vi /usr/local/etc/hadoop/slaves

localhost

svr217.nganthao.com

1.7       Create mapred-site.xml

# vi /usr/local/etc/hadoop/mapred-site.xml

<configuration>

<property>

<name>mapreduce.framework.name</name>

<value>yarn</value>

</property>

 

<property>

<name>mapred.job.tracker</name>

<value>svr207.nganthao.com:9001</value>

</property>

<property>

<name>mapred.map.tasks</name>

<value>2</value>

<description>

Define mapred.map.tasks as the number of slave hosts.

</description>

</property>

 

<property>

<name>mapred.reduce.tasks</name>

<value>2</value>

<description>

Define mapred.reduce.tasks as the number of slave hosts.

</description>

</property>

 

<property>

<name>mapred.system.dir</name>

<value>/home/nutch/filesystem/mapreduce/system</value>

</property>

 

<property>

<name>mapred.local.dir</name>

<value>/home/nutch/filesystem/mapreduce/local</value>

</property>

</configuration>

1.8       Create mapred-env.sh

# vi /usr/local/etc/hadoop/mapred-env.sh

export HADOOP_JOB_HISTORYSERVER_HEAPSIZE=1000

export HADOOP_MAPRED_ROOT_LOGGER=INFO,RFA

1.9       Create some necessary folders.

# mkdir /home/nutch

# mkdir /home/nutch/filesystem

# mkdir /home/nutch/filesystem/data /home/nutch/filesystem/name

# mkdir /home/nutch/filesystem/mapreduce

# mkdir /home/nutch/filesystem/mapreduce/local
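hadoop.tmp.dir in core-site.xml points to /home/nutch/tmp, so that directory is needed as well, and all of these directories must be writable by the nutch user. If they were created as root, fix the ownership the same way as in section 3.2:

# mkdir /home/nutch/tmp

# chown -R nutch:hadoop /home/nutch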

1.10  Create yarn-site.xml.

# vi /usr/local/etc/hadoop/yarn-site.xml

<configuration>

<!-- Site-specific YARN configuration properties -->

<property>

<name>yarn.nodemanager.aux-services</name>

<value>mapreduce_shuffle</value>

</property>

<property>

<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>

<value>org.apache.hadoop.mapred.ShuffleHandler</value>

</property>

</configuration>

1.11  Create yarn-env.sh

# cp /usr/local/share/examples/hadoop/conf/yarn-env.sh /usr/local/etc/hadoop/

2         Install and configure Hadoop on Node2

2.1       Install Hadoop2

# cd /usr/ports/devel/hadoop2

# make install clean

2.2       Create core-site.xml

 # vi /usr/local/etc/hadoop/core-site.xml

<configuration>

<property>

<name>fs.defaultFS</name>

<value>hdfs://svr207.nganthao.com:9000</value>

</property>

<property>

<name>fs.trash.interval</name>

<value>1440</value>

</property>

<property>

<name>fs.trash.checkpoint.interval</name>

<value>1440</value>

</property>

<property>

<name>hadoop.tmp.dir</name>

<value>/home/nutch/tmp</value>

</property></configuration>

2.3       Create hdfs-site.xml

# vi /usr/local/etc/hadoop/hdfs-site.xml

<configuration>

<property>

<name>dfs.replication</name>

<value>1</value>

</property>

<property>

<name>dfs.namenode.name.dir</name>

<value>file:///home/nutch/filesystem/name/</value>

</property>

<property>

<name>dfs.datanode.data.dir</name>

<value>file:///home/nutch/filesystem/data/</value>

</property>

</configuration>

2.4       Create hadoop-env.sh

# cp /usr/local/share/examples/hadoop/conf/hadoop-env.sh /usr/local/etc/hadoop

# vi /usr/local/etc/hadoop/hadoop-env.sh

export JAVA_HOME=/usr/local/openjdk7

2.5       Create mapred-site.xml

# vi /usr/local/etc/hadoop/mapred-site.xml

<configuration>

<property>

<name>mapreduce.framework.name</name>

<value>yarn</value>

</property>

 

<property>

<name>mapred.job.tracker</name>

<value>svr207.nganthao.com:9001</value>

</property>

<property>

<name>mapred.map.tasks</name>

<value>2</value>

<description>

Define mapred.map.tasks as the number of slave hosts.

</description>

</property>

 

<property>

<name>mapred.reduce.tasks</name>

<value>2</value>

<description>

Define mapred.reduce.tasks as the number of slave hosts.

</description>

</property>

 

<property>

<name>mapred.system.dir</name>

<value>/home/nutch/filesystem/mapreduce/system</value>

</property>

 

<property>

<name>mapred.local.dir</name>

<value>/home/nutch/filesystem/mapreduce/local</value>

</property>

</configuration>

2.6       Create mapred-env.sh

# vi /usr/local/etc/hadoop/mapred-env.sh

export HADOOP_JOB_HISTORYSERVER_HEAPSIZE=1000

export HADOOP_MAPRED_ROOT_LOGGER=INFO,RFA

 

2.7       Create yarn-site.xml.

# vi /usr/local/etc/hadoop/yarn-site.xml

<configuration>

<!-- Site-specific YARN configuration properties -->

<property>

<name>yarn.nodemanager.aux-services</name>

<value>mapreduce_shuffle</value>

</property>

<property>

<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>

<value>org.apache.hadoop.mapred.ShuffleHandler</value>

</property>

</configuration>

2.8       Create yarn-env.sh

# cp /usr/local/share/examples/hadoop/conf/yarn-env.sh /usr/local/etc/hadoop/

3         Create some folders and start Hadoop on the master node.

3.1       Create nutch user.

# pw user add nutch -g hadoop

3.2       Create folders.

# mkdir /home/nutch

# mkdir /home/nutch/tmp

# mkdir /home/nutch/filesystem

# mkdir /home/nutch/filesystem/name

# mkdir /home/nutch/filesystem/data

# chown -R nutch:hadoop /home/nutch

3.3       Start Hadoop automatically.

# echo 'datanode_enable="YES"' >> /etc/rc.conf

# echo 'datanode_user="nutch"' >> /etc/rc.conf

# echo 'namenode_enable="YES"' >> /etc/rc.conf

# echo 'namenode_user="nutch"' >> /etc/rc.conf

# echo 'resourcemanager_enable="YES"' >> /etc/rc.conf

# echo 'resourcemanager_user="nutch"' >> /etc/rc.conf

# echo 'secondarynamenode_enable="YES"' >> /etc/rc.conf

# echo 'secondarynamenode_user="nutch"' >> /etc/rc.conf
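Before HDFS is started for the first time, the NameNode storage directory (dfs.namenode.name.dir) normally has to be formatted; a typical invocation, run as the nutch user:

% hdfs namenode -format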

# /usr/local/etc/rc.d/namenode start

# /usr/local/etc/rc.d/datanode start

# /usr/local/etc/rc.d/resourcemanager start

# /usr/local/etc/rc.d/secondarynamenode start

4         Check Hadoop status

nutch@svr207:~ % jps

82691 NameNode

83106 Jps

82780 DataNode

82871 SecondaryNameNode

82930 ResourceManager

nutch@svr207:~ %
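Beyond jps, HDFS health can be checked from the command line; the report should list the live datanodes and their capacity. With default ports, the NameNode web UI is normally also reachable at http://svr207.nganthao.com:50070/.

nutch@svr207:~ % hdfs dfsadmin -report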

5         Install Nutch

5.1       Install apache-ant

# cd /usr/ports/devel/apache-ant

# make install clean

5.2       Download and build apache-nutch.

# fetch http://archive.apache.org/dist/nutch/1.7/apache-nutch-1.7-src.tar.gz

# tar zxf apache-nutch-1.7-src.tar.gz

# cd apache-nutch-1.7

# ant

# mkdir /home/nutch

# mkdir /home/nutch/search

# cp -R ./build/* /home/nutch/search

# cd /home/nutch/search

# mkdir urls

# vim urls/seed.txt

http://nutch.apache.org

http://apache.org

# hadoop fs -put urls urls

nutch@svr207:~ % hadoop fs -ls

Found 5 items

drwx——   – nutch supergroup          0 2015-03-26 00:00 .Trash

drwx——   – nutch supergroup          0 2015-03-24 06:15 urls

nutch@svr207:~ % hadoop jar apache-nutch-1.7.job org.apache.nutch.crawl.Crawl urls -dir crawl -depth 3 -topN 5
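Once the crawl job finishes, its output lands in the crawl directory in HDFS; listing it should show the crawldb, linkdb, and segments subdirectories:

nutch@svr207:~ % hadoop fs -ls crawl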

   
