Selected versions
- JDK 1.8
- Hadoop 3.3.6
- Spark 3.5.1
- Scala 2.12.19
- Hive 3.1.3
- HBase 2.2.3 (phoenix-hbase 2.2-5.1.3)
- Flink 1.16.3
- Flink CDC 3.0.1
Installation and configuration
Hadoop
Edit hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>/usr/share/data_warehouse/hadoop-3.3.6/hdfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/mnt/hdfs/NAH69E9X,/mnt/hdfs/NAGXWN4X,/mnt/hdfs/Z2934NSK00009246V4NY,/mnt/hdfs/NAGW6SMX</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
<description>Replication factor; the default is 3, and it should not exceed the number of DataNodes</description>
</property>
<!-- Host and port of the SecondaryNameNode HTTP endpoint -->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>localhost:9870</value>
</property>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.http.address</name>
<value>0.0.0.0:50070</value>
<description>Bind to 0.0.0.0 instead of the loopback address so that port 50070 is reachable from outside this host</description>
</property>
<!-- Volume choosing policy: the default is round-robin; with unevenly sized disks the available-space policy is the better fit -->
<property>
<name>dfs.datanode.fsdataset.volume.choosing.policy</name>
<value>org.apache.hadoop.hdfs.server.datanode.fsdataset.AvailableSpaceVolumeChoosingPolicy</value>
</property>
<!-- Default 0.75: the probability that a block is written to the volume with more available space. Values of 0.5 or below defeat the purpose of the policy, so the default is usually kept. -->
<property>
<name>dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction</name>
<value>0.75f</value>
</property>
<!-- Balance threshold between volumes, default 10 GB (10737418240 bytes): among this node's storage directories, if the difference between the most-used and the least-used volume is within this threshold, blocks are allocated round-robin. -->
<property>
<name>dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold</name>
<value>10737418240</value>
</property>
</configuration>
hadoop-env.sh
export JAVA_HOME=/usr/local/jdk1.8.0_202
export HADOOP_HOME=/usr/share/data_warehouse/hadoop-3.3.6
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
export HADOOP_CLIENT_OPTS="-Xmx2048m $HADOOP_CLIENT_OPTS"
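With hdfs-site.xml and hadoop-env.sh in place, a first start can look like the sketch below (paths follow the layout above; the format step is only for a brand-new NameNode and destroys existing metadata).
# Format the NameNode once, then bring up HDFS and check the DataNode report
$HADOOP_HOME/bin/hdfs namenode -format
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/bin/hdfs dfsadmin -report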
mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- Default execution mode for MR jobs: yarn (cluster mode) or local -->
<property>
<name>mapreduce.framework.name</name>
<value>local</value>
</property>
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapred.job.tracker</name>
<value>localhost:9001</value>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>1536</value>
</property>
<property>
<name>mapreduce.map.java.opts</name>
<value>-Xmx1024M</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>3072</value>
</property>
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-Xmx2560M</value>
</property>
<!-- MapReduce JobHistory server address -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>localhost:10020</value>
</property>
<!-- MapReduce JobHistory server web UI address -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>localhost:19888</value>
</property>
</configuration>
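Since the JobHistory addresses are configured above, the history server can be started with the Hadoop 3 daemon command (sketch):
# Start the MapReduce JobHistory server; web UI on port 19888
$HADOOP_HOME/bin/mapred --daemon start historyserver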
yarn-site.xml
<?xml version="1.0"?>
<configuration>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>0.0.0.0:8088</value>
</property>
<!-- Site specific YARN configuration properties -->
<!-- Host of the YARN ResourceManager -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>localhost</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Whether to enforce physical memory limits on containers -->
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<!-- Whether to enforce virtual memory limits on containers -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<!-- Enable log aggregation -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- URL of the log server (the JobHistory server) -->
<property>
<name>yarn.log.server.url</name>
<value>http://localhost:19888/jobhistory/logs</value>
</property>
<!-- Retain aggregated logs for 7 days -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
</configuration>
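YARN can then be brought up and checked like this (sketch):
# Start ResourceManager/NodeManager and list the registered nodes
$HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/bin/yarn node -list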
Hive
Remove the slf4j jar from lib (it conflicts with Hadoop's SLF4J binding).
hive-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- JDBC connection URL -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive?useSSL=false</value>
</property>
<!-- JDBC driver -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.mariadb.jdbc.Driver</value>
</property>
<!-- JDBC username -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
</property>
<!-- JDBC password -->
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>hive</value>
</property>
<!-- Default Hive warehouse directory on HDFS -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/pixivic/hive</value>
</property>
<!-- Host HiveServer2 binds to -->
<property>
<name>hive.server2.thrift.bind.host</name>
<value>0.0.0.0</value>
</property>
<!-- HiveServer2 Thrift port -->
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://0.0.0.0:9083</value>
</property>
</configuration>
hive-env.sh
HADOOP_HOME=/usr/share/data_warehouse/hadoop-3.3.6
JAVA_HOME=/usr/local/jdk1.8.0_202
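With the metastore database reachable at the JDBC URL above, initializing the schema and starting the services might look like the following sketch (assumes the hive database and the hive/hive credentials from hive-site.xml already exist in MySQL):
# Initialize the metastore schema, then start the metastore and HiveServer2 in the background
$HIVE_HOME/bin/schematool -dbType mysql -initSchema
nohup $HIVE_HOME/bin/hive --service metastore > metastore.log 2>&1 &
nohup $HIVE_HOME/bin/hive --service hiveserver2 > hiveserver2.log 2>&1 &
# Smoke test through Beeline against the Thrift port configured above
$HIVE_HOME/bin/beeline -u jdbc:hive2://localhost:10000 -e 'show databases;'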
Fix the character set of the metastore tables
To fix garbled Chinese comments shown by desc, alter the Hive metadata tables stored in MySQL:
1) Column comments
alter table COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
2) Table comments
alter table TABLE_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
3) Partition parameters, so that partition keys can contain Chinese
alter table PARTITION_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
alter table PARTITION_KEYS modify column PKEY_COMMENT varchar(4000) character set utf8;
4) Index comments
alter table INDEX_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
MySQL
SET sql_mode='NO_BACKSLASH_ESCAPES';
SET GLOBAL sql_mode = 'NO_BACKSLASH_ESCAPES';
commit;
HBase
hbase-env.sh
export JAVA_HOME=/usr/local/jdk1.8.0_202
hbase-site.xml
<configuration>
<property>
<name>hbase.cluster.distributed</name>
<value>false</value>
</property>
<property>
<name>hbase.unsafe.stream.capability.enforce</name>
<value>false</value>
</property>
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>./zookeeper</value>
</property>
<property>
<name>hbase.rootdir</name>
<value>hdfs://localhost:9000/user/pixivic/hbase</value>
</property>
<property>
<name>hbase.wal.provider</name>
<value>filesystem</value>
</property>
<property>
<name>phoenix.schema.isNamespaceMappingEnabled</name>
<value>true</value>
</property>
<property>
<name>phoenix.schema.mapSystemTablesToNamespace</name>
<value>true</value>
</property>
</configuration>
Phoenix installation: https://phoenix.apache.org/installation.html
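After placing the Phoenix server jar in $HBASE_HOME/lib as that page describes, a start and smoke test can look like this sketch (the Phoenix path below is illustrative):
# Start HBase (standalone, rooted in HDFS per hbase.rootdir) and open the shells
$HBASE_HOME/bin/start-hbase.sh
$HBASE_HOME/bin/hbase shell
# Phoenix thick-client shell, connecting through HBase's managed ZooKeeper
/usr/share/data_warehouse/phoenix-hbase-2.2-5.1.3-bin/bin/sqlline.py localhost:2181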
Scala
curl -fL https://github.com/coursier/coursier/releases/latest/download/cs-x86_64-pc-linux.gz | gzip -d > cs && chmod +x cs
cs install scala:2.12.19 && cs install scalac:2.12.19
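A quick check that the expected version is picked up (assumes coursier's bin directory has been added to PATH):
scala -version
scalac -version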
Spark
Download the "without Hadoop" build and run it in single-node (standalone) mode.
Connecting to Hive
Also download the "with Hadoop" build and copy its jars into the "without Hadoop" installation.
Copy hive-site.xml into the conf directory.
Edit spark-defaults.conf
spark.sql.hive.metastore.jars=path
spark.sql.hive.metastore.jars.path=file:///usr/share/data_warehouse/apache-hive-3.1.3-bin/lib/*.jar
spark.sql.hive.metastore.version=3.1.3
#spark.sql.catalogImplementation=hive
spark.sql.hive.metastore.sharedPrefixes=org.mariadb.jdbc
Edit spark-env.sh
export SPARK_HOME=/usr/share/data_warehouse/spark-3.5.1-bin-without-hadoop
export HADOOP_HOME=/usr/share/data_warehouse/hadoop-3.3.6
export PATH=$PATH:$SPARK_HOME/bin:$HADOOP_HOME/bin
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_DIST_CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath)
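With the Hive metastore service running, Spark's Hive integration can be smoke-tested from the shell (sketch):
# spark-sql should list the databases known to the Hive metastore
$SPARK_HOME/bin/spark-sql -e 'show databases;'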
Connecting to HBase
Clone the https://github.com/apache/hbase-connectors.git repository and build the connector:
git clone https://github.com/apache/hbase-connectors.git
cd hbase-connectors/spark/
mvn -Dspark.version=3.5.1 -Dscala.version=2.12.19 -Dscala.binary.version=2.12 -Dhbase.version=2.3.0 -Dhadoop-three.version=3.3.6 -DskipTests clean package
find . -type f -name hbase-spark-protocol-shaded-1.1.0-SNAPSHOT.jar
find . -type f -name hbase-spark-1.1.0-SNAPSHOT.jar
find . -type f -name scala-library-2.12.19.jar
Copy the three jars into Spark's jars directory (see the sketch below).
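A minimal way to do the copy (assumes the find commands above were run from the hbase-connectors checkout); copying hbase-site.xml into Spark's conf directory is one common way to let the connector locate HBase:
# Copy the two connector jars plus the Scala library into Spark's jars directory
for jar in hbase-spark-1.1.0-SNAPSHOT.jar hbase-spark-protocol-shaded-1.1.0-SNAPSHOT.jar scala-library-2.12.19.jar; do
  cp "$(find . -type f -name "$jar" | head -n 1)" "$SPARK_HOME/jars/"
done
# Make the HBase client configuration visible to Spark
cp "$HBASE_HOME/conf/hbase-site.xml" "$SPARK_HOME/conf/"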
Flink
The required dependency jars can be fetched with a pom.xml by running:
mvn -f pom.xml dependency:copy-dependencies
The lib directory should end up containing the following jars:
-rw-r--r-- 1 root root 616888 May 23 00:04 commons-configuration2-2.1.1.jar
-rw-r--r-- 1 root root 61829 May 23 00:05 commons-logging-1.2.jar
-rw-r--r-- 1 510 staff 198816 Nov 13 2023 flink-cep-1.16.3.jar
-rw-r--r-- 1 510 staff 516135 Nov 13 2023 flink-connector-files-1.16.3.jar
-rw-r--r-- 1 root root 26450 May 22 21:42 flink-connector-hbase-2.2-1.16.3.jar
-rw-r--r-- 1 510 staff 102470 Nov 13 2023 flink-csv-1.16.3.jar
-rw-r--r-- 1 510 staff 117263528 Nov 13 2023 flink-dist-1.16.3.jar
-rw-r--r-- 1 510 staff 180247 Nov 13 2023 flink-json-1.16.3.jar
-rw-r--r-- 1 510 staff 21052641 Nov 13 2023 flink-scala_2.12-1.16.3.jar
-rw-r--r-- 1 510 staff 10737871 Feb 17 2022 flink-shaded-zookeeper-3.5.9.jar
-rw-r--r-- 1 root root 26607719 May 26 12:23 flink-sql-connector-hbase-2.2-1.16.3.jar
-rw-r--r-- 1 root root 23715175 May 22 15:13 flink-sql-connector-mysql-cdc-3.0.1.jar
-rw-r--r-- 1 510 staff 15365908 Nov 13 2023 flink-table-api-java-uber-1.16.3.jar
-rw-r--r-- 1 510 staff 36253165 Nov 13 2023 flink-table-planner-loader-1.16.3.jar
-rw-r--r-- 1 510 staff 3151254 Nov 13 2023 flink-table-runtime-1.16.3.jar
-rw-r--r-- 1 root root 106145 May 23 00:05 hadoop-auth-3.3.6.jar
-rw-r--r-- 1 root root 8573 Jun 18 2023 hadoop-client-3.3.6.jar
-rw-r--r-- 1 root root 4603101 Jun 18 2023 hadoop-common-3.3.6.jar
-rw-r--r-- 1 root root 6304455 Jun 18 2023 hadoop-hdfs-3.3.6.jar
-rw-r--r-- 1 root root 1773049 May 23 00:04 hadoop-mapreduce-client-core-3.3.6.jar
-rw-r--r-- 1 root root 3362359 May 22 23:52 hadoop-shaded-guava-1.1.1.jar
-rw-r--r-- 1 root root 1502280 May 23 00:06 htrace-core4-4.1.0-incubating.jar
-rw-r--r-- 1 510 staff 208006 Jan 14 2022 log4j-1.2-api-2.17.1.jar
-rw-r--r-- 1 510 staff 301872 Jan 14 2022 log4j-api-2.17.1.jar
-rw-r--r-- 1 510 staff 1790452 Jan 14 2022 log4j-core-2.17.1.jar
-rw-r--r-- 1 510 staff 24279 Jan 14 2022 log4j-slf4j-impl-2.17.1.jar
-rw-r--r-- 1 root root 161867 Feb 28 2014 stax2-api-3.1.4.jar
-rw-r--r-- 1 root root 512742 Aug 24 2016 woodstox-core-5.0.3.jar
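With those jars in lib, a local cluster and the SQL client start as usual (sketch):
# Start a standalone Flink cluster and open the SQL client
$FLINK_HOME/bin/start-cluster.sh
$FLINK_HOME/bin/sql-client.sh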
Some Flink SQL tuning settings
-- Checkpoint settings
-- State backend
SET 'state.backend' = 'rocksdb';
-- Checkpoint interval
SET 'execution.checkpointing.interval' = '4h';
-- Checkpointing mode
SET 'execution.checkpointing.mode' = 'AT_LEAST_ONCE';
-- Timeouts
SET 'execution.checkpointing.aligned-checkpoint-timeout' = '10s';
SET 'table.exec.checkpoint.timeout' = '600s';
SET 'execution.checkpointing.timeout' = '600s';
-- Checkpoint compression
SET 'state.backend.checkpoint-compression' = 'true';
-- Local recovery
SET 'state.backend.local-recovery' = 'true';
-- Tolerated checkpoint failures
SET 'execution.checkpointing.tolerable-failed-checkpoints' = '10000';
-- Restart attempts
SET 'restart-strategy.fixed-delay.attempts' = '2147483647';
SET 'akka.ask.timeout' = '60s';
SET 'execution.parallelism' = '1';
SET 'execution.checkpointing.min-pause' = '1000';
SET 'execution.checkpointing.max-concurrent-checkpoints' = '1';
SET 'execution.checkpointing.externalized-checkpoint-retention' = 'RETAIN_ON_CANCELLATION';
-- Unaligned (asynchronous) checkpoints
SET 'execution.checkpointing.unaligned.enabled' = 'true';
-- Incremental checkpoints
SET 'state.backend.incremental' = 'true';
-- Restart strategy
SET 'restart-strategy.type' = 'fixed-delay';
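These SET statements go at the top of the SQL script that defines the job. Purely as an illustration (every host, database, table, column and credential below is made up, and the target HBase table with column family info must already exist), a MySQL-CDC-to-HBase pipeline could be submitted non-interactively like this:
cat > mysql_to_hbase.sql <<'EOF'
SET 'state.backend' = 'rocksdb';
SET 'execution.checkpointing.interval' = '4h';
-- (plus the remaining SET statements above)

-- Hypothetical CDC source table read from the MySQL binlog
CREATE TABLE src_user (
  id BIGINT,
  name STRING,
  PRIMARY KEY (id) NOT ENFORCED
) WITH (
  'connector' = 'mysql-cdc',
  'hostname' = 'localhost',
  'port' = '3306',
  'username' = 'flink',
  'password' = 'flinkpw',
  'database-name' = 'demo',
  'table-name' = 'user'
);

-- Hypothetical HBase sink: row key plus one column family "info"
CREATE TABLE sink_user (
  rowkey STRING,
  info ROW<name STRING>,
  PRIMARY KEY (rowkey) NOT ENFORCED
) WITH (
  'connector' = 'hbase-2.2',
  'table-name' = 'user',
  'zookeeper.quorum' = 'localhost:2181'
);

INSERT INTO sink_user SELECT CAST(id AS STRING), ROW(name) FROM src_user;
EOF
$FLINK_HOME/bin/sql-client.sh -f mysql_to_hbase.sql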
/etc/profile
JAVA_HOME=/usr/local/jdk1.8.0_202
HADOOP_HOME=/usr/share/data_warehouse/hadoop-3.3.6
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
CLASSPATH=$JAVA_HOME/lib/
PATH=$PATH:$JAVA_HOME/bin
export PATH JAVA_HOME CLASSPATH HADOOP_HOME
export SPARK_HOME=/usr/share/data_warehouse/spark-3.5.1-bin-without-hadoop
export PATH=$PATH:$SPARK_HOME/bin
export PATH=$PATH:$SPARK_HOME/sbin
export HIVE_HOME=/usr/share/data_warehouse/apache-hive-3.1.3-bin
export PATH=$HIVE_HOME/bin:$PATH
#export HBASE_HOME=/usr/share/data_warehouse/hbase-2.4.17
#export HBASE_HOME=/usr/share/data_warehouse/hbase-2.5.8
export HBASE_HOME=/usr/share/data_warehouse/hbase-2.2.3
export PATH=$HBASE_HOME/bin:$PATH
export FLINK_HOME=/usr/share/data_warehouse/flink-1.16.3
export M2_HOME=/usr/local/apache-maven-3.9.6
export PATH=${M2_HOME}/bin:${PATH}
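Reload the profile and sanity-check that each component resolves (sketch):
source /etc/profile
hadoop version
hive --version
hbase version
$FLINK_HOME/bin/flink --version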