This tutorial was co-authored by xhx and Claude 3.7 Sonnet.

This setup targets CentOS 7 (2009). Other releases may run into all sorts of problems, so do not experiment with them casually and waste time. It works in either a virtual machine or on a physical machine; installing a virtual machine in VirtualBox is recommended. MobaXterm is the suggested terminal tool, and WinSCP the suggested file-transfer tool.

This guide is divided into 11 numbered parts plus a preparation step. The main big data components installed are: MySQL, the JDK, Hadoop/YARN, ZooKeeper, Hive, Kafka, Spark, Flink, HBase, and Flume.

The software required by this guide can be downloaded from: https://pan.quark.cn/s/49707c42257e

The resulting environment is essentially the same as the one required by the Jiangsu provincial round of the vocational-college big data skills competition. Big data development environments are highly version-sensitive, so there is no need to chase the newest component versions; doing so only invites obscure bugs.

0. Preparation

First, set up a few basic environment variables and the directory structure.

# Create the application installation directories
mkdir -p /opt/module
mkdir -p /opt/software

# Use WinSCP to copy the required packages into /opt/software (use the scp command if you are working from a Linux machine)

# Create the data directory
mkdir -p /data


# Set the JAVA_HOME environment variable (the JDK itself is installed in part 2)
echo 'export JAVA_HOME=/opt/module/jdk1.8.0' >> /etc/profile
echo 'export PATH=$PATH:$JAVA_HOME/bin' >> /etc/profile


# Set the hostname and the hosts mapping
echo "master" > /etc/hostname
# Assuming the machine's IP address is 192.168.55.123
echo "192.168.55.123 master" >> /etc/hosts


# Reload the environment variables
source /etc/profile
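
As a quick sanity check (optional), confirm that the hostname and the hosts mapping took effect. This is a minimal sketch assuming the 192.168.55.123 address used above; the value written to /etc/hostname only applies after a re-login or reboot, so hostnamectl is used here to apply it immediately.

# Apply the hostname immediately (equivalent to the /etc/hostname change after a reboot)
hostnamectl set-hostname master
# Verify the hostname and that the name "master" resolves
hostname
ping -c 1 master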

1. Install MySQL 5.7

The offline installation packages required are:

mysql-community-common-5.7.28-1.el7.x86_64.rpm
mysql-community-libs-5.7.28-1.el7.x86_64.rpm
mysql-community-client-5.7.28-1.el7.x86_64.rpm
mysql-community-server-5.7.28-1.el7.x86_64.rpm

Put all of the packages into one directory, e.g. mysqlhome. To avoid downloading the dependencies one by one, run the following command inside that directory:

yum localinstall *.rpm -y

With network access, yum will fetch the remaining dependencies automatically.

After the installation completes, start the MySQL service:

# Start the MySQL service and enable it at boot
systemctl start mysqld
systemctl enable mysqld

Next, change the root password to "123456".

# Retrieve the temporary root password
sudo grep 'temporary password' /var/log/mysqld.log


# Relax the password-validation policy so that a simple password is accepted
# Edit the MySQL configuration file
vi /etc/my.cnf


# Add the following lines to the [mysqld] section
[mysqld]
validate_password_policy=LOW
validate_password_length=6
validate_password_special_char_count=0
validate_password_mixed_case_count=0
validate_password_number_count=0

Restart the mysqld service, log in with the temporary password, and set the new one:

systemctl restart mysqld
# Log in to MySQL with the temporary password
mysql -uroot -p
# Then run the following statement inside the MySQL shell
ALTER USER 'root'@'localhost' IDENTIFIED BY '123456';

Enable remote access (run these statements in the MySQL shell as well), then turn off the firewall:

CREATE USER 'root'@'%' IDENTIFIED BY '123456';
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' WITH GRANT OPTION;
FLUSH PRIVILEGES;
# Back in the Linux shell, stop and disable the firewall
systemctl stop firewalld
systemctl disable firewalld
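
To confirm that the new password and remote access work, a minimal check (assuming the 192.168.55.123 address from the preparation step):

# Log in locally with the new password and print the server version
mysql -uroot -p123456 -e "SELECT VERSION();"
# Verify remote access by connecting through the host's IP (run this from another machine if possible)
mysql -h 192.168.55.123 -uroot -p123456 -e "SHOW DATABASES;"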

2. Install JDK 1.8

cd /opt/software


# Extract the JDK
tar -zxvf jdk-8u*.tar.gz -C /opt/module/
mv /opt/module/jdk1.8.0_* /opt/module/jdk1.8.0


# Verify the installation
source /etc/profile
java -version
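
If java -version does not report 1.8.0, double-check that JAVA_HOME points at the extracted directory before moving on; a quick sketch:

# JAVA_HOME was set to /opt/module/jdk1.8.0 in the preparation step
echo $JAVA_HOME
$JAVA_HOME/bin/java -version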

3. Install Hadoop 3.1.3 and YARN 3.1.3

# Extract Hadoop
tar -zxvf hadoop-3.1.3.tar.gz -C /opt/module/


# Configure environment variables
echo 'export HADOOP_HOME=/opt/module/hadoop-3.1.3' >> /etc/profile
echo 'export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin' >> /etc/profile
source /etc/profile


# Configure core-site.xml
cat > /opt/module/hadoop-3.1.3/etc/hadoop/core-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/data/hadoop/tmp</value>
    </property>
</configuration>
EOF


# Configure hdfs-site.xml
mkdir -p /data/hadoop/hdfs/{name,data}
cat > /opt/module/hadoop-3.1.3/etc/hadoop/hdfs-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
      <name>dfs.namenode.rpc-bind-host</name>
      <value>0.0.0.0</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/data/hadoop/hdfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/data/hadoop/hdfs/data</value>
    </property>
</configuration>
EOF


# Configure mapred-site.xml
cat > /opt/module/hadoop-3.1.3/etc/hadoop/mapred-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>master:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>master:19888</value>
    </property>
</configuration>
EOF


# Configure yarn-site.xml
cat > /opt/module/hadoop-3.1.3/etc/hadoop/yarn-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
</configuration>
EOF


# Format HDFS
hdfs namenode -format


# Edit start-dfs.sh and stop-dfs.sh
vi /opt/module/hadoop-3.1.3/sbin/start-dfs.sh


# Add the following lines at the top of the file (right after the #!/bin/bash line)
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root


# Do the same in /opt/module/hadoop-3.1.3/sbin/stop-dfs.sh.
# Next, edit the /opt/module/hadoop-3.1.3/sbin/start-yarn.sh and /opt/module/hadoop-3.1.3/sbin/stop-yarn.sh files:




# Edit start-yarn.sh
vi /opt/module/hadoop-3.1.3/sbin/start-yarn.sh


# Add the following lines at the top of the file (right after the #!/bin/bash line)
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root

# Edit stop-yarn.sh the same way.


# Generate a key pair and copy it to enable passwordless SSH to master
ssh-keygen
ssh-copy-id master
# Edit hadoop-env.sh and set JAVA_HOME in it
vi /opt/module/hadoop-3.1.3/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/opt/module/jdk1.8.0



# Start Hadoop
/opt/module/hadoop-3.1.3/sbin/start-dfs.sh
/opt/module/hadoop-3.1.3/sbin/start-yarn.sh

# Open up the HDFS directory permissions so that other users can modify the HDFS
# file system remotely. Not recommended in production.

hdfs dfs -chmod -R 777 /
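
Before moving on, it is worth verifying that all of the daemons are up. Roughly the following checks should pass (the web UI ports are the Hadoop 3.x defaults):

# NameNode, DataNode, SecondaryNameNode, ResourceManager and NodeManager should all be listed
jps
# HDFS report: one live datanode is expected in this single-node setup
hdfs dfsadmin -report
# Simple HDFS smoke test
hdfs dfs -mkdir -p /test
hdfs dfs -ls /
# Web UIs (open in a browser): http://master:9870 for HDFS, http://master:8088 for YARN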

4. Install ZooKeeper 3.5.7

# Extract ZooKeeper
tar -zxvf apache-zookeeper-3.5.7-bin.tar.gz -C /opt/module/
mv /opt/module/apache-zookeeper-3.5.7-bin /opt/module/zookeeper-3.5.7


# Configure environment variables
echo 'export ZOOKEEPER_HOME=/opt/module/zookeeper-3.5.7' >> /etc/profile
echo 'export PATH=$PATH:$ZOOKEEPER_HOME/bin' >> /etc/profile
source /etc/profile


# Create the data directory
mkdir -p /data/zookeeper


# Configure zoo.cfg
cat > /opt/module/zookeeper-3.5.7/conf/zoo.cfg << EOF
tickTime=2000
initLimit=10
syncLimit=5
dataDir=/data/zookeeper
clientPort=2181
server.1=master:2888:3888
EOF


# Create the myid file
echo "1" > /data/zookeeper/myid


# Start ZooKeeper
/opt/module/zookeeper-3.5.7/bin/zkServer.sh start
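
A quick way to confirm that ZooKeeper is serving requests (in this single-node setup it reports standalone mode):

# Check the server status; expect "Mode: standalone"
/opt/module/zookeeper-3.5.7/bin/zkServer.sh status
# The QuorumPeerMain process should also show up in jps
jps | grep QuorumPeerMain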

5. Install Hive 3.1.2

# Extract Hive
tar -zxvf apache-hive-3.1.2-bin.tar.gz -C /opt/module/
mv /opt/module/apache-hive-3.1.2-bin /opt/module/hive-3.1.2


# Configure environment variables
echo 'export HIVE_HOME=/opt/module/hive-3.1.2' >> /etc/profile
echo 'export PATH=$PATH:$HIVE_HOME/bin' >> /etc/profile
source /etc/profile


# Create Hive's directories on HDFS
hadoop fs -mkdir -p /user/hive/warehouse
hadoop fs -mkdir -p /tmp/hive
hadoop fs -chmod g+w /user/hive/warehouse
hadoop fs -chmod g+w /tmp/hive


# Configure hive-site.xml
cat > /opt/module/hive-3.1.2/conf/hive-site.xml << EOF
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://localhost:3306/metastore?createDatabaseIfNotExist=true&amp;useSSL=false</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.cj.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>123456</value>
    </property>
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
    </property>
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
    </property>
    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>master</value>
    </property>
    <property>
        <name>hive.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>
</configuration>
EOF


# Copy the MySQL JDBC driver into Hive's lib directory
cp /opt/software/mysql-connector-j-8.4.0.jar /opt/module/hive-3.1.2/lib/


# Resolve the Guava version conflict between Hadoop and Hive
# Check the Guava version shipped with Hadoop
find $HADOOP_HOME -name "guava*.jar"


# Check the Guava version shipped with Hive
find $HIVE_HOME -name "guava*.jar"


# Remove Hive's own Guava JAR
find $HIVE_HOME/lib -name "guava*.jar" -exec rm -f {} \;


# Copy Hadoop's Guava JAR into Hive's lib directory
find $HADOOP_HOME -name "guava*.jar" -exec cp {} $HIVE_HOME/lib/ \;


# Initialize the Hive metastore schema
/opt/module/hive-3.1.2/bin/schematool -dbType mysql -initSchema


# Create the HDFS scratch directory and open up its permissions
hadoop fs -mkdir -p /tmp/hive
hadoop fs -chmod -R 777 /tmp/hive


# Start the Hive Metastore service
nohup hive --service metastore &
# Start HiveServer2
nohup hive --service hiveserver2 &
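
HiveServer2 can take a minute or two before it accepts connections. A minimal smoke test through Beeline (10000 is the default HiveServer2 port):

# Connect through HiveServer2 and run a trivial query
beeline -u jdbc:hive2://master:10000 -n root -e "SHOW DATABASES;"
# Or go through the Hive CLI directly against the metastore
hive -e "SHOW DATABASES;"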

6. Install Kafka 2.4.1

# Extract Kafka
tar -zxvf kafka_2.12-2.4.1.tgz -C /opt/module/
mv /opt/module/kafka_2.12-2.4.1 /opt/module/kafka-2.4.1


# Configure environment variables
echo 'export KAFKA_HOME=/opt/module/kafka-2.4.1' >> /etc/profile
echo 'export PATH=$PATH:$KAFKA_HOME/bin' >> /etc/profile
source /etc/profile


# Create the Kafka log directory
mkdir -p /data/kafka-logs


# Write the Kafka broker configuration
cat > /opt/module/kafka-2.4.1/config/server.properties << EOF
broker.id=0
listeners=PLAINTEXT://master:9092
num.network.threads=3
num.io.threads=8
socket.send.buffer.bytes=102400
socket.receive.buffer.bytes=102400
socket.request.max.bytes=104857600
log.dirs=/data/kafka-logs
num.partitions=1
num.recovery.threads.per.data.dir=1
offsets.topic.replication.factor=1
transaction.state.log.replication.factor=1
transaction.state.log.min.isr=1
log.retention.hours=168
log.segment.bytes=1073741824
log.retention.check.interval.ms=300000
zookeeper.connect=master:2181
zookeeper.connection.timeout.ms=18000
group.initial.rebalance.delay.ms=0
EOF


# Start Kafka
nohup /opt/module/kafka-2.4.1/bin/kafka-server-start.sh /opt/module/kafka-2.4.1/config/server.properties &
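
A minimal produce/consume smoke test, using an arbitrary topic name (test_topic):

# Create a test topic and list topics to confirm it exists
/opt/module/kafka-2.4.1/bin/kafka-topics.sh --create --bootstrap-server master:9092 --replication-factor 1 --partitions 1 --topic test_topic
/opt/module/kafka-2.4.1/bin/kafka-topics.sh --list --bootstrap-server master:9092
# Produce a few lines (Ctrl+C to exit)
/opt/module/kafka-2.4.1/bin/kafka-console-producer.sh --broker-list master:9092 --topic test_topic
# Consume them from the beginning in another terminal (Ctrl+C to exit)
/opt/module/kafka-2.4.1/bin/kafka-console-consumer.sh --bootstrap-server master:9092 --topic test_topic --from-beginning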

7. Install Spark 3.1.1

# Extract Spark
tar -zxvf spark-3.1.1-bin-hadoop3.2.tgz -C /opt/module/
mv /opt/module/spark-3.1.1-bin-hadoop3.2 /opt/module/spark-3.1.1


# Configure environment variables
echo 'export SPARK_HOME=/opt/module/spark-3.1.1' >> /etc/profile
echo 'export PATH=$PATH:$SPARK_HOME/bin' >> /etc/profile
source /etc/profile


# Configure spark-env.sh
cp /opt/module/spark-3.1.1/conf/spark-env.sh.template /opt/module/spark-3.1.1/conf/spark-env.sh
cat >> /opt/module/spark-3.1.1/conf/spark-env.sh << EOF
export JAVA_HOME=/opt/module/jdk1.8.0
export HADOOP_HOME=/opt/module/hadoop-3.1.3
export HADOOP_CONF_DIR=/opt/module/hadoop-3.1.3/etc/hadoop
export SPARK_MASTER_HOST=master
export SPARK_MASTER_PORT=7077
export SPARK_WORKER_CORES=1
export SPARK_WORKER_MEMORY=1g
EOF


# Configure spark-defaults.conf
cp /opt/module/spark-3.1.1/conf/spark-defaults.conf.template /opt/module/spark-3.1.1/conf/spark-defaults.conf
cat >> /opt/module/spark-3.1.1/conf/spark-defaults.conf << EOF
spark.master                     spark://master:7077
spark.eventLog.enabled           true
spark.eventLog.dir               hdfs://master:9000/spark-logs
spark.history.fs.logDirectory    hdfs://master:9000/spark-logs
EOF


# Create the Spark history server log directory on HDFS
hadoop fs -mkdir -p /spark-logs


# Start Spark (standalone master/worker and the history server)
/opt/module/spark-3.1.1/sbin/start-all.sh
/opt/module/spark-3.1.1/sbin/start-history-server.sh
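
To verify the standalone cluster, submit the bundled SparkPi example (the master URL matches spark-defaults.conf; adjust the examples jar name if your distribution differs):

# Run SparkPi on the standalone cluster
spark-submit --class org.apache.spark.examples.SparkPi --master spark://master:7077 $SPARK_HOME/examples/jars/spark-examples_2.12-3.1.1.jar 10
# Web UIs: http://master:8080 (Spark master), http://master:18080 (history server)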

8. Install Flink 1.14.0

# Extract Flink (the archive already unpacks to /opt/module/flink-1.14.0, so no rename is needed)
tar -zxvf flink-1.14.0-bin-scala_2.12.tgz -C /opt/module/


# Configure environment variables
echo 'export FLINK_HOME=/opt/module/flink-1.14.0' >> /etc/profile
echo 'export PATH=$PATH:$FLINK_HOME/bin' >> /etc/profile
source /etc/profile


# Configure flink-conf.yaml
cat > /opt/module/flink-1.14.0/conf/flink-conf.yaml << EOF
jobmanager.rpc.address: master
jobmanager.rpc.port: 6123
jobmanager.memory.process.size: 1600m
taskmanager.memory.process.size: 1728m
taskmanager.numberOfTaskSlots: 1
parallelism.default: 1
EOF


# Configure the workers file
echo "master" > /opt/module/flink-1.14.0/conf/workers


# Start the Flink cluster
/opt/module/flink-1.14.0/bin/start-cluster.sh
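
To confirm the Flink cluster is healthy, check that both processes are running and try the batch WordCount example that ships with Flink:

# The JobManager and TaskManager should appear in jps
jps | grep -E "StandaloneSessionClusterEntrypoint|TaskManagerRunner"
# Run the bundled WordCount example
/opt/module/flink-1.14.0/bin/flink run /opt/module/flink-1.14.0/examples/batch/WordCount.jar
# Web UI: http://master:8081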

9. Install HBase 2.2.3

# Extract HBase (the archive already unpacks to /opt/module/hbase-2.2.3, so no rename is needed)
tar -zxvf hbase-2.2.3-bin.tar.gz -C /opt/module/


# Configure environment variables
echo 'export HBASE_HOME=/opt/module/hbase-2.2.3' >> /etc/profile
echo 'export PATH=$PATH:$HBASE_HOME/bin' >> /etc/profile
source /etc/profile


# Configure hbase-env.sh (use the external ZooKeeper instead of the bundled one)
cat >> /opt/module/hbase-2.2.3/conf/hbase-env.sh << EOF
export JAVA_HOME=/opt/module/jdk1.8.0
export HBASE_MANAGES_ZK=false
EOF


# Configure hbase-site.xml
cat > /opt/module/hbase-2.2.3/conf/hbase-site.xml << EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>hbase.rootdir</name>
        <value>hdfs://master:9000/hbase</value>
    </property>
    <property>
        <name>hbase.cluster.distributed</name>
        <value>true</value>
    </property>
    <property>
        <name>hbase.zookeeper.quorum</name>
        <value>master</value>
    </property>
    <property>
        <name>hbase.zookeeper.property.clientPort</name>
        <value>2181</value>
    </property>
    <property>
        <name>hbase.unsafe.stream.capability.enforce</name>
        <value>false</value>
    </property>
</configuration>
EOF


# Start HBase
/opt/module/hbase-2.2.3/bin/start-hbase.sh
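
A minimal HBase shell smoke test (the table and column family names here are arbitrary):

# HMaster and HRegionServer should be running
jps | grep -E "HMaster|HRegionServer"
# Create a table, write one cell, and scan it
hbase shell <<'EOF'
create 'test_table', 'cf'
put 'test_table', 'row1', 'cf:a', 'value1'
scan 'test_table'
exit
EOF
# Web UI: http://master:16010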

10. Install Flume 1.9.0

# Extract Flume
tar -zxvf apache-flume-1.9.0-bin.tar.gz -C /opt/module/
mv /opt/module/apache-flume-1.9.0-bin /opt/module/flume-1.9.0


# Configure environment variables
echo 'export FLUME_HOME=/opt/module/flume-1.9.0' >> /etc/profile
echo 'export PATH=$PATH:$FLUME_HOME/bin' >> /etc/profile
source /etc/profile


# Configure flume-env.sh
cp /opt/module/flume-1.9.0/conf/flume-env.sh.template /opt/module/flume-1.9.0/conf/flume-env.sh
cat >> /opt/module/flume-1.9.0/conf/flume-env.sh << EOF
export JAVA_HOME=/opt/module/jdk1.8.0
EOF


# Create a simple example Flume configuration
cat > /opt/module/flume-1.9.0/conf/example.conf << EOF
# example.conf: A single-node Flume configuration


# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1


# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = master
a1.sources.r1.port = 44444


# Describe the sink
a1.sinks.k1.type = logger


# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
EOF
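
To try the example out, start an agent with this configuration and send it a line of text; this sketch assumes nc (netcat) is installed for the test message.

# Start the agent defined in example.conf, printing received events to the console
/opt/module/flume-1.9.0/bin/flume-ng agent --conf /opt/module/flume-1.9.0/conf --conf-file /opt/module/flume-1.9.0/conf/example.conf --name a1 -Dflume.root.logger=INFO,console
# In another terminal, send a test event to the netcat source
echo "hello flume" | nc master 44444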

11. Startup Script

Below is a script that starts and stops the core components (HBase is not included, as it may conflict with the others):

#!/bin/bash

# Big data component start/stop script
# Usage: ./bigdata_control.sh [start|stop|status]
# Component order:
# start: HDFS -> YARN -> ZooKeeper -> Hive -> Kafka -> Flink
# stop:  Flink -> Kafka -> Hive -> ZooKeeper -> YARN -> HDFS


# Color definitions
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color


# Log file
LOG_DIR="$HOME/logs"
mkdir -p $LOG_DIR
LOG_FILE="$LOG_DIR/bigdata_control_$(date '+%Y%m%d').log"


# Check the command-line arguments
if [ $# -ne 1 ] || [[ ! "$1" =~ ^(start|stop|status)$ ]]; then
    echo -e "${RED}Usage error: $0 [start|stop|status]${NC}"
    exit 1
fi


ACTION=$1


# Record the start of the operation
echo "$(date '+%Y-%m-%d %H:%M:%S') - starting the $ACTION operation" >> $LOG_FILE


# Check for the Hive Metastore process
check_hive_metastore() {
    if ps aux | grep -v grep | grep "org.apache.hadoop.hive.metastore.HiveMetaStore" > /dev/null; then
        return 0
    fi
    
    if ps aux | grep -v grep | grep "\-\-service metastore" > /dev/null; then
        return 0
    fi
    
    if netstat -tlnp 2>/dev/null | grep -q ":9083"; then
        return 0
    fi
    
    return 1
}


# Check for the HiveServer2 process
check_hiveserver2() {
    if ps aux | grep -v grep | grep "org.apache.hive.service.server.HiveServer2" > /dev/null; then
        return 0
    fi
    
    if ps aux | grep -v grep | grep "\-\-service hiveserver2" > /dev/null; then
        return 0
    fi
    
    if netstat -tlnp 2>/dev/null | grep -q ":10000"; then
        return 0
    fi
    
    return 1
}


# Check for the Flink JobManager process
check_flink_jobmanager() {
    if jps -l 2>/dev/null | grep -q "org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint"; then
        return 0
    fi
    
    if ps aux | grep -v grep | grep -q "org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint"; then
        return 0
    fi
    
    if ps aux | grep -v grep | grep -q "StandaloneSessionClusterEntrypoint"; then
        return 0
    fi
    
    if netstat -tlnp 2>/dev/null | grep -q ":8081"; then
        return 0
    fi
    
    if command -v curl &>/dev/null; then
        if curl -s -m 3 http://localhost:8081/overview 2>/dev/null | grep -q "taskmanagers"; then
            return 0
        fi
    fi
    
    return 1
}


# Check for the Flink TaskManager process
check_flink_taskmanager() {
    if jps -l 2>/dev/null | grep -q "org.apache.flink.runtime.taskexecutor.TaskManagerRunner"; then
        return 0
    fi
    
    if ps aux | grep -v grep | grep -q "org.apache.flink.runtime.taskexecutor.TaskManagerRunner"; then
        return 0
    fi
    
    if ps aux | grep -v grep | grep -q "TaskManagerRunner"; then
        return 0
    fi
    
    if command -v curl &>/dev/null; then
        if curl -s -m 3 http://localhost:8081/taskmanagers 2>/dev/null | grep -q "taskmanagers"; then
            if ! curl -s -m 3 http://localhost:8081/taskmanagers 2>/dev/null | grep -q "\"taskmanagers\":\[\]"; then
                return 0
            fi
        fi
    fi
    
    return 1
}


# Generic process check function
check_process() {
    local process_name=$1
    
    case "$process_name" in
        "HiveMetaStore")
            check_hive_metastore
            return $?
            ;;
        "HiveServer2")
            check_hiveserver2
            return $?
            ;;
        "StandaloneSessionClusterEntrypoint")
            check_flink_jobmanager
            return $?
            ;;
        "TaskManagerRunner")
            check_flink_taskmanager
            return $?
            ;;
        *)
            if jps 2>/dev/null | grep -i "$process_name" > /dev/null; then
                return 0
            fi
            return 1
            ;;
    esac
}


# Wait for a process to start
wait_for_process() {
    local process_name=$1
    local max_wait=$2
    local wait_time=0
    
    echo -ne "${YELLOW}等待 $process_name 启动"
    while [ $wait_time -lt $max_wait ]; do
        if check_process "$process_name"; then
            echo -e "${GREEN} [已启动]${NC}"
            return 0
        fi
        echo -n "."
        sleep 1
        wait_time=$((wait_time + 1))
    done
    
    if check_process "$process_name"; then
        echo -e "${GREEN} [已启动]${NC}"
        return 0
    fi
    
    echo -e "${RED} [超时]${NC}"
    return 1
}


# Wait for a process to stop
wait_for_process_stop() {
    local process_name=$1
    local max_wait=$2
    local wait_time=0
    
    echo -ne "${YELLOW}等待 $process_name 停止"
    while [ $wait_time -lt $max_wait ]; do
        if ! check_process "$process_name"; then
            echo -e "${GREEN} [已停止]${NC}"
            return 0
        fi
        echo -n "."
        sleep 1
        wait_time=$((wait_time + 1))
    done
    echo -e "${RED} [超时]${NC}"
    return 1
}


# Start a component
start_component() {
    local component=$1
    local start_cmd=$2
    local process_check=$3
    local max_wait=${4:-30}
    
    echo -e "${BLUE}[$(date '+%Y-%m-%d %H:%M:%S')] 正在启动 $component...${NC}"
    echo "$(date '+%Y-%m-%d %H:%M:%S') - 启动 $component" >> $LOG_FILE
    
    if check_process "$process_check"; then
        echo -e "${GREEN}$component 已经在运行中${NC}"
        return 0
    fi
    
    eval "$start_cmd" >> $LOG_FILE 2>&1
    local start_result=$?
    
    if [ $start_result -ne 0 ]; then
        echo -e "${YELLOW}启动命令返回代码: $start_result,可能存在问题${NC}"
    fi
    
    if wait_for_process "$process_check" $max_wait; then
        echo -e "${GREEN}$component 启动成功${NC}"
        return 0
    else
        echo -e "${RED}$component 启动超时,但这不一定意味着失败${NC}"
        
        case "$component" in
            "Hive Metastore"|"HiveServer2"|"Flink JobManager")
                echo -e "${YELLOW}正在进行额外检查...${NC}"
                sleep 10
                if check_process "$process_check"; then
                    echo -e "${GREEN}$component 已成功启动${NC}"
                    return 0
                fi
                echo -e "${YELLOW}建议手动检查 $component 状态${NC}"
                ;;
        esac
        
        return 1
    fi
}


# Stop a component
stop_component() {
    local component=$1
    local stop_cmd=$2
    local process_check=$3
    local max_wait=${4:-30}
    
    echo -e "${BLUE}[$(date '+%Y-%m-%d %H:%M:%S')] 正在停止 $component...${NC}"
    echo "$(date '+%Y-%m-%d %H:%M:%S') - 停止 $component" >> $LOG_FILE
    
    if ! check_process "$process_check"; then
        echo -e "${YELLOW}$component 未运行${NC}"
        return 0
    fi
    
    eval "$stop_cmd" >> $LOG_FILE 2>&1
    
    if wait_for_process_stop "$process_check" $max_wait; then
        echo -e "${GREEN}$component 已成功停止${NC}"
        return 0
    else
        echo -e "${RED}$component 停止超时,可能需要手动终止进程${NC}"
        return 1
    fi
}


# Get the PID of a process
get_pid() {
    local process_name=$1
    local pid=""
    
    case "$process_name" in
        "HiveMetaStore")
            pid=$(ps aux | grep -v grep | grep "org.apache.hadoop.hive.metastore.HiveMetaStore" | awk '{print $2}' | head -1)
            if [ -z "$pid" ]; then
                pid=$(ps aux | grep -v grep | grep "\-\-service metastore" | awk '{print $2}' | head -1)
            fi
            ;;
        "HiveServer2")
            pid=$(ps aux | grep -v grep | grep "org.apache.hive.service.server.HiveServer2" | awk '{print $2}' | head -1)
            if [ -z "$pid" ]; then
                pid=$(ps aux | grep -v grep | grep "\-\-service hiveserver2" | awk '{print $2}' | head -1)
            fi
            ;;
        "StandaloneSessionClusterEntrypoint")
            pid=$(jps -l 2>/dev/null | grep "org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint" | awk '{print $1}' | head -1)
            if [ -z "$pid" ]; then
                pid=$(ps aux | grep -v grep | grep "StandaloneSessionClusterEntrypoint" | awk '{print $2}' | head -1)
            fi
            ;;
        "TaskManagerRunner")
            pid=$(jps -l 2>/dev/null | grep "org.apache.flink.runtime.taskexecutor.TaskManagerRunner" | awk '{print $1}' | head -1)
            if [ -z "$pid" ]; then
                pid=$(ps aux | grep -v grep | grep "TaskManagerRunner" | awk '{print $2}' | head -1)
            fi
            ;;
        *)
            pid=$(jps 2>/dev/null | grep -i "$process_name" | awk '{print $1}')
            ;;
    esac
    
    echo "$pid"
}


# Improved component status check function
check_component_status() {
    local component=$1
    local process_check=$2
    
    echo -n "$component: "
    
    if check_process "$process_check"; then
        local pid=$(get_pid "$process_check")
        
        if [ ! -z "$pid" ]; then
            echo -e "${GREEN}运行中 (PID: $pid)${NC}"
        else
            echo -e "${GREEN}运行中${NC}"
        fi
    else
        echo -e "${RED}未运行${NC}"
    fi
}


# Start all components
start_all() {
    echo -e "${BLUE}==============================================${NC}"
    echo -e "${BLUE}       Starting all big data components       ${NC}"
    echo -e "${BLUE}==============================================${NC}"
    echo ""

    # 1. Start HDFS
    start_component "HDFS" "start-dfs.sh" "NameNode" 60 || {
        echo -e "${RED}HDFS failed to start; aborting the remaining components${NC}"
        return 1
    }

    # Check HDFS health
    echo "Checking HDFS health..."
    hdfs dfsadmin -report >> $LOG_FILE 2>&1
    if [ $? -ne 0 ]; then
        echo -e "${RED}HDFS may not be running correctly, please check${NC}"
    else
        echo -e "${GREEN}HDFS is running normally${NC}"
    fi

    # 2. Start YARN
    start_component "YARN" "start-yarn.sh" "ResourceManager" 60 || {
        echo -e "${YELLOW}YARN failed to start, but the remaining components will still be started${NC}"
    }

    # 3. Start ZooKeeper
    start_component "ZooKeeper" "$ZOOKEEPER_HOME/bin/zkServer.sh start" "QuorumPeerMain" 30 || {
        echo -e "${YELLOW}ZooKeeper failed to start, but the remaining components will still be started${NC}"
    }

    # 4. Start the Hive Metastore
    start_component "Hive Metastore" "nohup hive --service metastore &" "HiveMetaStore" 60 || {
        echo -e "${YELLOW}Hive Metastore failed to start or timed out, but the remaining components will still be started${NC}"
    }

    # 5. Start HiveServer2
    start_component "HiveServer2" "nohup hive --service hiveserver2 &" "HiveServer2" 60 || {
        echo -e "${YELLOW}HiveServer2 failed to start or timed out, but the remaining components will still be started${NC}"
    }

    # 6. Start Kafka
    start_component "Kafka" "$KAFKA_HOME/bin/kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties" "Kafka" 45 || {
        echo -e "${YELLOW}Kafka failed to start, but the remaining components will still be started${NC}"
    }

    # 7. Start the Flink JobManager (standalone mode)
    start_component "Flink JobManager" "$FLINK_HOME/bin/start-cluster.sh" "StandaloneSessionClusterEntrypoint" 60 || {
        echo -e "${YELLOW}The Flink start command completed, but the process check did not pass${NC}"

        sleep 10
        if check_flink_jobmanager; then
            echo -e "${GREEN}Flink JobManager has started successfully${NC}"
        else
            echo -e "${YELLOW}Flink JobManager may not have started correctly, please check it manually${NC}"
        fi
    }

    echo ""
    echo -e "${BLUE}==============================================${NC}"
    echo -e "${GREEN}Startup of all components is complete${NC}"
    echo -e "${BLUE}==============================================${NC}"
}


# Stop all components
stop_all() {
    echo -e "${BLUE}==============================================${NC}"
    echo -e "${BLUE}       Stopping all big data components       ${NC}"
    echo -e "${BLUE}==============================================${NC}"
    echo ""

    # Stop the components in the reverse of the startup order

    # 1. Stop Flink
    stop_component "Flink" "$FLINK_HOME/bin/stop-cluster.sh" "StandaloneSessionClusterEntrypoint" 30

    # 2. Stop Kafka
    stop_component "Kafka" "$KAFKA_HOME/bin/kafka-server-stop.sh" "Kafka" 30

    # 3. Stop HiveServer2
    stop_component "HiveServer2" "pkill -f 'org.apache.hive.service.server.HiveServer2' || pkill -f 'hive --service hiveserver2'" "HiveServer2" 30

    # 4. Stop the Hive Metastore
    stop_component "Hive Metastore" "pkill -f 'org.apache.hadoop.hive.metastore.HiveMetaStore' || pkill -f 'hive --service metastore'" "HiveMetaStore" 30

    # 5. Stop ZooKeeper
    stop_component "ZooKeeper" "$ZOOKEEPER_HOME/bin/zkServer.sh stop" "QuorumPeerMain" 30

    # 6. Stop YARN
    stop_component "YARN" "stop-yarn.sh" "ResourceManager" 60

    # 7. Stop HDFS
    stop_component "HDFS" "stop-dfs.sh" "NameNode" 60

    echo ""
    echo -e "${BLUE}==============================================${NC}"
    echo -e "${GREEN}Shutdown of all components is complete${NC}"
    echo -e "${BLUE}==============================================${NC}"
}


# Show the status of all components
show_status() {
    echo -e "${BLUE}==============================================${NC}"
    echo -e "${BLUE}       Big data component status              ${NC}"
    echo -e "${BLUE}==============================================${NC}"
    echo ""

    check_component_status "NameNode" "NameNode"
    check_component_status "DataNode" "DataNode"
    check_component_status "ResourceManager" "ResourceManager"
    check_component_status "NodeManager" "NodeManager"
    check_component_status "ZooKeeper" "QuorumPeerMain"
    check_component_status "Hive Metastore" "HiveMetaStore"
    check_component_status "HiveServer2" "HiveServer2"
    check_component_status "Kafka" "Kafka"
    check_component_status "Flink JobManager" "StandaloneSessionClusterEntrypoint"
    check_component_status "Flink TaskManager" "TaskManagerRunner"

    echo ""
    echo -e "${BLUE}==============================================${NC}"

    # Show the Flink REST API status
    if command -v curl &>/dev/null; then
        echo -e "${YELLOW}Flink REST API status:${NC}"
        if curl -s -m 3 http://localhost:8081/overview 2>/dev/null | grep -q "taskmanagers"; then
            echo -e "${GREEN}Flink REST API is responding${NC}"
        else
            echo -e "${RED}Flink REST API is not responding${NC}"
        fi
    fi
}


# Check environment variables
check_env() {
    local missing_vars=0

    if [ -z "$ZOOKEEPER_HOME" ]; then
        echo -e "${YELLOW}Warning: ZOOKEEPER_HOME is not set${NC}"
        missing_vars=1
    fi

    if [ -z "$KAFKA_HOME" ]; then
        echo -e "${YELLOW}Warning: KAFKA_HOME is not set${NC}"
        missing_vars=1
    fi

    if [ -z "$FLINK_HOME" ]; then
        echo -e "${YELLOW}Warning: FLINK_HOME is not set${NC}"
        missing_vars=1
    fi

    if [ $missing_vars -eq 1 ]; then
        echo -e "${YELLOW}Please set the required environment variables or adjust the paths in this script${NC}"
        echo ""

        if [ "$ACTION" != "status" ]; then
            echo -n -e "${YELLOW}Continue anyway? (y/n): ${NC}"
            read -r confirm
            if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
                echo -e "${RED}Operation cancelled${NC}"
                exit 1
            fi
        fi
    fi
}


# Main function
main() {
    case "$ACTION" in
        start)
            start_all
            ;;
        stop)
            stop_all
            ;;
        status)
            show_status
            ;;
        *)
            echo -e "${RED}Unsupported action: $ACTION${NC}"
            exit 1
            ;;
    esac

    echo ""
    echo -e "${YELLOW}Log file: $LOG_FILE${NC}"
}


# Check the environment variables
check_env


# Run the main function
main

Usage:
Save the script as bigdata.sh.

Add it to your PATH:

mkdir -p ~/bin
mv bigdata.sh ~/bin/bigdata
chmod +x ~/bin/bigdata
echo 'export PATH="$HOME/bin:$PATH"' >> ~/.bashrc

Apply the change:

source ~/.bashrc

Test it:

bigdata start
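
The same command also supports checking status and shutting everything down in reverse order:

bigdata status
bigdata stop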

That completes the installation. Happy Coding!

April 20, 2025