Install Hadoop
Download JDK
JDK 8u161 download page
# tar xvfz jdk-8u161-linux-x64.tar.gz
# cp -r jdk1.8.0_161/ /usr/local
Set up the java symlink
# cd /usr/bin
# ln -s /usr/local/jdk1.8.0_161/bin/java java
Configure the profile - vi /etc/profile : set the PATH
JAVA_HOME=/usr/local/jdk1.8.0_161
export JAVA_HOME
HADOOP_HOME=/usr/local/hadoop-1.2.1
export HADOOP_HOME
CLASSPATH=/usr/local/jdk1.8.0_161/lib
export CLASSPATH
PATH=$HADOOP_HOME/bin:$JAVA_HOME/bin:$PATH
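As a quick check (a minimal sketch; the hadoop command only works once the Hadoop 1.2.1 tarball has actually been extracted to /usr/local/hadoop-1.2.1), apply the profile and confirm that the paths resolve:
# . /etc/profile
# echo $JAVA_HOME
# java -version
# hadoop version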
Hadoop configuration
# cd /usr/local/hadoop-1.2.1/conf
# vi hadoop-env.sh
export JAVA_HOME=/usr/local/jdk1.8.0_161
export HADOOP_HOME_WARN_SUPPRESS="TRUE"
# vi core-site.xml
IP-ADDRESS : the NameNode's IP address or hostname
Edit the hosts file on both Windows and Linux so that the hostname can be resolved (see the example /etc/hosts sketch after the configuration below).
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://IP-ADDRESS:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/hadoop-1.2.1/tmp</value>
</property>
</configuration>
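For the hosts file edit mentioned above, a minimal /etc/hosts sketch might look like the following. The addresses are only examples (192.168.111.101 matches the NameNode address used later in the JDBC URL); adjust them to your own network:
# vi /etc/hosts
192.168.111.101   master
192.168.111.102   slave1
192.168.111.103   slave2
192.168.111.104   slave3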
hdfs-site.xml
# vi hdfs-site.xml
50070 : the port of the web page where the NameNode's metadata can be viewed
<configuration>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>/usr/local/hadoop-1.2.1/data</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>/usr/local/hadoop-1.2.1/name</value>
</property>
</configuration>
MapReduce configuration
# vi mapred-site.xml
Configuration for the JobTracker
Opens port 9001, which MapReduce clients use to submit jobs to the JobTracker.
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>NameNode-IP-ADDRESS:9001</value>
</property>
</configuration>
Apply the Hadoop environment settings
# . /etc/profile
Hadoop format
# hadoop namenode -format
Start Hadoop and check the running Java processes
# start-all.sh
# jps
- Open http://NAMENODE-IP-ADDRESS:50070 in a web browser
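In a pseudo-distributed Hadoop 1.x setup, jps should list NameNode, SecondaryNameNode, DataNode, JobTracker and TaskTracker. A quick way to confirm that HDFS is healthy (a sketch, not required by the guide):
# hadoop dfsadmin -report
# hadoop fs -ls /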
- Hadoop pseudo-distributed mode configuration
- Generate SSH KEY
Generate an SSH key and append it to authorized_keys so that SSH does not prompt for a password. Stop Hadoop first, because it must not be running while the key is generated.
# stop-all.sh
# ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
# cd .ssh
Copy the public key into authorized_keys so the node can SSH to itself
# cat id_dsa.pub >> authorized_keys
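To confirm that passwordless SSH works, connect to the local machine; it should log in without asking for a password (a minimal check; if it still prompts, tightening the key file permissions with chmod usually helps):
# chmod 600 ~/.ssh/authorized_keys
# ssh localhost
# exit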
- Hadoop fully distributed mode configuration
[master] # cd .ssh
Distribute the public key to each slave with ssh-copy-id:
[master] # ssh-copy-id -i id_dsa.pub root@slave1
[master] # ssh-copy-id -i id_dsa.pub root@slave2
[master] # ssh-copy-id -i id_dsa.pub root@slave3
Distribute the configuration files to the slaves
[master] # scp /etc/profile root@slave1:/etc/profile
[master] # scp -r /usr/local/hadoop-1.2.1/ root@slave1:/usr/local
[master] # scp -r /usr/local/hadoop-1.2.1/ root@slave2:/usr/local
[master] # scp -r /usr/local/hadoop-1.2.1/ root@slave3:/usr/local
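A quick sanity check, run from the master, that the files arrived and that the ssh-copy-id step works (a sketch; the paths match the scp commands above):
[master] # ssh root@slave1 'ls /usr/local/hadoop-1.2.1'
[master] # ssh root@slave1 'tail -n 3 /etc/profile'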
hdfs-site.xml
# vi hdfs-site.xml
50070 : the port of the NameNode metadata web page; dfs.replication sets the number of data replicas
<configuration>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.http.address</name>
<value>NAMENODE-IP-ADDRESS:50070</value>
</property>
<property>
<name>dfs.secondary.http.address</name>
<value>SECONDARYNODE-IP-ADDRESS:50090</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>/usr/local/hadoop-1.2.1/data</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>/usr/local/hadoop-1.2.1/name</value>
</property>
</configuration>
Install HIVE
1. Install MariaDB
MariaDB installation file download
http://www.mariadb.org/
- Download the MariaDB installation files into the Downloads folder
MariaDB-10.0.15-centos7_0-x86_64-client.rpm
MariaDB-10.0.15-centos7_0-x86_64-common.rpm
MariaDB-10.0.15-centos7_0-x86_64-server.rpm
- Install the downloaded packages
# cd 다운로드/
# yum -y remove mariadb-libs
# yum -y localinstall Maria*
Create a hive account that Hive can use to connect
# systemctl restart mysql
# systemctl status mysql
Start the service automatically at boot
# chkconfig mysql on
# mysqladmin -u root password '111111'
# mysql -u root -p
MariaDB [(none)]> use mysql
Change the privileges of the hive account
Grant the hive account full privileges for all connections from localhost
MariaDB [mysql]> grant all privileges on *.* to hive@'localhost' identified by '111111';
Grant the hive account privileges for connections coming from any IP address
MariaDB [mysql]> grant all privileges on *.* to hive@'%' identified by '111111';
Create the database that the hive account will use
MariaDB [mysql]> create database hive_db;
MariaDB [mysql]> commit;
MariaDB [mysql]> show databases;
MariaDB [(none)]> use hive_db
Database changed
MariaDB [hive_db]>
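To confirm that the grants work, log in as the hive account and check that hive_db is visible (a minimal check; the password is the one set above):
# mysql -u hive -p111111 -e "show databases;"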
1. Install Hive
Download Hive
# tar xvfz apache-hive-1.0.1-bin.tar.gz
# cp -r apache-hive-1.0.1-bin /usr/local/hive-1.0.1
2. profile
# vi /etc/profile
HIVE_HOME=/usr/local/hive-1.0.1
export HIVE_HOME
PATH=$HADOOP_HOME/bin:$JAVA_HOME/bin:$HIVE_HOME/bin:$PATH
# . /etc/profile
3. Hive-MariaDB connector
Put the MariaDB JDBC driver into Hive's lib directory so that Hive can use it
mariadb-java-client-1.3.5.jar
# cd 다운로드
# cp mariadb-java-client-1.3.5.jar /usr/local/hive-1.0.1/lib
4. HDFS space used by Hive
- Hadoop data: the permissions on the HDFS /tmp directory must be opened before Hive can run
# hadoop dfs -mkdir /tmp/hive
# hadoop dfs -chmod 777 /tmp
# hadoop dfs -chmod 777 /tmp/hive
# hive
5. Specify the directory where the database structure is stored
- If the warehouse directory is not created, the db_ directories are created in the current directory instead
# hadoop dfs -mkdir /user/root/warehouse
# hadoop dfs -chmod 777 /user/root/warehouse
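The directories and permissions can be verified from HDFS before starting Hive (a sketch; the /tmp and warehouse entries should show drwxrwxrwx):
# hadoop dfs -ls /
# hadoop dfs -ls /user/root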
6. Configure the metastore connection (hive-site.xml)
# cd /usr/local/hive-1.0.1/conf/
# touch hive-site.xml
# vi hive-site.xml
<configuration>
<property>
<name>hive.metastore.local</name>
<value>true</value>
<description>controls whether to connect to remote metastore server or open a new metastore server in Hive Client JVM</description>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mariadb://localhost:3306/hive_db?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.mariadb.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>111111</value>
<description>password to use against metastore database</description>
</property>
</configuration>
7. Start Hadoop, then start Hive
# start-all.sh
# hive
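To confirm that Hive is talking to the MariaDB metastore, run a simple statement and then check that metastore tables were created in hive_db (a sketch; table names such as DBS and TBLS come from the standard Hive metastore schema):
# hive -e "show databases;"
# mysql -u hive -p111111 -e "use hive_db; show tables;"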
Connecting Eclipse to Hadoop
Database server: a Hadoop pseudo-distributed system + (Hadoop ecosystem)
Hive is used to connect Hadoop with a Java project.
The Java project requests data with Hive SQL; Hive in turn submits MapReduce jobs to Hadoop.
Java project: created with Eclipse on Windows
Preparing Hadoop on Linux
- Start the Hadoop server
Run the Linux OS on VMware.
Open a Linux terminal and run Hadoop.
Verify the running Java processes with "jps"
# start-all.sh
# jps
# hive
Data to use
Restaurant (cafe) information for Gyeonggi-do
Fields delimited by ',', lines delimited by '\n'
CREATE TABLE cafe (
  City STRING,
  PlaceName STRING,
  PermitDate DATE,
  BusinessStatus STRING,
  MultipleFacilities STRING,
  TotalSize DOUBLE,
  HygieneIndustry STRING,
  HygieneName STRING,
  NumberAddress STRING,
  StreetAddress STRING,
  Zipcode INT,
  Latitude DOUBLE,
  Longitude DOUBLE
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
Delete the first (header) row of the CSV:
sed -e '1d' cafeUTF.csv > cafe.csv
If the data values are wrapped in double quotes:
Find the file by name in the current directory and strip the double quotes from the data
find . -name airports.csv -exec perl -p -i -e 's/"//g' {} \;
(Screenshots: the CSV contents before and after deleting the first row)
Load the CSV file into the cafe table:
load data local inpath '/root/csv/cafe.csv'
overwrite into table cafe;
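A quick way to confirm the load worked (a sketch; the count query launches a MapReduce job):
hive> select city, placename from cafe limit 3;
hive> select count(*) from cafe;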
Start HiveServer2 so that external JDBC clients can connect:
# hive --service hiveserver2
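Before wiring up the Java project, the HiveServer2 endpoint can be tested with Beeline from another terminal (a sketch; the user and password match the ones used in the Java example below):
# beeline -u jdbc:hive2://NAMENODE-IP-ADDRESS:10000 -n root -p 111111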
Preparing the Java project
- SLF4J
SLF4J (Simple Logging Facade for Java): a logging facade that can front log4j, the JDK logger, commons-logging, and others. Download slf4j.
- Download the jar files
- Java project setting
Project > Properties > Java build path > Add External Jars
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

import org.json.simple.JSONArray;

public class HiveTest {
    public static void main(String[] args) throws Exception {
        // Load the Hive JDBC driver
        Class.forName("org.apache.hive.jdbc.HiveDriver");

        // Connect to HiveServer2 running on the NameNode
        Connection conn = DriverManager.getConnection(
                "jdbc:hive2://192.168.111.101:10000/default", "root", "111111");
        Statement stmt = conn.createStatement();

        // Ask Hive for two rows from the cafe table
        String cafe = "select city, placename from cafe LIMIT 2";
        ResultSet rs = stmt.executeQuery(cafe);

        // Collect the result rows into a JSON array
        JSONArray ja = new JSONArray();
        while (rs.next()) {
            JSONArray data = new JSONArray();
            data.add(rs.getString(1));
            data.add(rs.getString(2));
            ja.add(data);
        }
        System.out.println(ja.toJSONString());
        System.out.println("Success....");
        conn.close();
    }
}