feat(thirdparty): Bump Hadoop to 3.3.6
acelyc111 committed Sep 22, 2024
1 parent c90aa48 commit f2d74b9
Showing 9 changed files with 161 additions and 74 deletions.
2 changes: 2 additions & 0 deletions .github/actions/rebuild_thirdparty_if_needed/action.yaml
@@ -46,4 +46,6 @@ runs:
         ../build_tools/download_zk.sh zookeeper-bin
         rm -rf hadoop-bin/share/doc
         rm -rf zookeeper-bin/docs
+        mv hadoop-bin ..
+        mv zookeeper-bin ..
       shell: bash
2 changes: 0 additions & 2 deletions .github/actions/upload_artifact/action.yaml
@@ -21,8 +21,6 @@ runs:
   steps:
     - name: Tar files
       run: |
-        mv thirdparty/hadoop-bin ./
-        mv thirdparty/zookeeper-bin ./
         rm -rf thirdparty
         # The following operations are tricky, these directories and files don't exist if not build with '--test'.
         # When build binaries for client tests, it's not needed to add '--test'.
21 changes: 16 additions & 5 deletions build_tools/download_hadoop.sh
@@ -19,13 +19,24 @@
 
 set -e
 
-CWD=$(cd $(dirname $0) && pwd)
+CWD=$(cd "$(dirname "$0")" && pwd)
 
 if [ $# -ge 1 ]; then
     HADOOP_BIN_PATH=$1
 fi
 
-HADOOP_VERSION=2.8.4
-HADOOP_DIR_NAME=hadoop-${HADOOP_VERSION}
-HADOOP_PACKAGE_MD5="b30b409bb69185003b3babd1504ba224"
-${CWD}/download_package.sh ${HADOOP_DIR_NAME} ${HADOOP_PACKAGE_MD5} ${HADOOP_BIN_PATH}
+HADOOP_VERSION="hadoop-3.3.6"
+arch_output=$(arch)
+if [ "$arch_output"x == "aarch64"x ]; then
+    HADOOP_PACKAGE_MD5="369f899194a920e0d1c3c3bc1718b3b5"
+    HADOOP_BASE_NAME=${HADOOP_VERSION}-"$(arch)"
+else
+    if [ "$arch_output"x != "x86_64"x ]; then
+        echo "WARNING: unrecognized CPU architecture '$arch_output', use 'x86_64' as default"
+    fi
+    HADOOP_PACKAGE_MD5="1cbe1214299cd3bd282d33d3934b5cbd"
+    HADOOP_BASE_NAME=${HADOOP_VERSION}
+fi
+
+DOWNLOAD_BASE_URL="https://archive.apache.org/dist/hadoop/common/${HADOOP_VERSION}/"
+"${CWD}"/download_package.sh "${HADOOP_BASE_NAME}" ${HADOOP_PACKAGE_MD5} "${HADOOP_BIN_PATH}" ${DOWNLOAD_BASE_URL} "${HADOOP_VERSION}"
52 changes: 31 additions & 21 deletions build_tools/download_package.sh
@@ -21,59 +21,69 @@ set -e
 
 if [ $# -lt 2 ]; then
     echo "Invalid arguments !"
-    echo "USAGE: $0 <DIR_NAME> <PACKAGE_MD5> [TARGET_PATH]"
+    echo "USAGE: $0 <PACKAGE_BASE_NAME> <PACKAGE_MD5> [TARGET_PATH]"
     exit 1
 fi
 
-DIR_NAME=$1
+PACKAGE_BASE_NAME=$1
 PACKAGE_MD5=$2
 
 if [ $# -lt 3 ]; then
-    echo "TARGET_PATH is not provided, thus do not try to download ${DIR_NAME}"
+    echo "TARGET_PATH is not provided, thus do not try to download ${PACKAGE_BASE_NAME}"
     exit 0
 fi
 
 TARGET_PATH=$3
-if [ -d ${TARGET_PATH} ]; then
-    echo "TARGET_PATH ${TARGET_PATH} has existed, thus do not try to download ${DIR_NAME}"
+if [ -d "${TARGET_PATH}" ]; then
+    echo "TARGET_PATH ${TARGET_PATH} has existed, thus do not try to download ${PACKAGE_BASE_NAME}"
     exit 0
 fi
 
-PACKAGE_NAME=${DIR_NAME}.tar.gz
-if [ ! -f ${PACKAGE_NAME} ]; then
-    echo "Downloading ${DIR_NAME}..."
+DEFAULT_DOWNLOAD_BASE_URL="https://pegasus-thirdparty-package.oss-cn-beijing.aliyuncs.com/"
+if [ $# -ge 4 ]; then
+    DEFAULT_DOWNLOAD_BASE_URL=$4
+fi
+
+DIR_NAME=${PACKAGE_BASE_NAME}
+if [ $# -ge 5 ]; then
+    DIR_NAME=$5
+fi
+
+PACKAGE_NAME=${PACKAGE_BASE_NAME}.tar.gz
+if [ ! -f "${PACKAGE_NAME}" ]; then
+    echo "Downloading ${PACKAGE_NAME} ..."
 
-    DOWNLOAD_URL="https://pegasus-thirdparty-package.oss-cn-beijing.aliyuncs.com/${PACKAGE_NAME}"
-    if ! wget -T 10 -t 5 ${DOWNLOAD_URL}; then
-        echo "ERROR: download ${DIR_NAME} failed"
+    DOWNLOAD_URL=${DEFAULT_DOWNLOAD_BASE_URL}${PACKAGE_NAME}
+    if ! wget -q -T 10 -t 5 "${DOWNLOAD_URL}"; then
+        echo "ERROR: download ${PACKAGE_NAME} failed"
         exit 1
     fi
 
-    if [ `md5sum ${PACKAGE_NAME} | awk '{print$1}'` != ${PACKAGE_MD5} ]; then
+    if [ "$(md5sum "${PACKAGE_NAME}" | awk '{print$1}')" != "${PACKAGE_MD5}" ]; then
         echo "Check file ${PACKAGE_NAME} md5sum failed!"
         exit 1
     fi
 fi
 
-rm -rf ${DIR_NAME}
+rm -rf "${DIR_NAME}"
 
-echo "Decompressing ${DIR_NAME}..."
-if ! tar xf ${PACKAGE_NAME}; then
-    echo "ERROR: decompress ${DIR_NAME} failed"
-    rm -f ${PACKAGE_NAME}
+echo "Decompressing ${PACKAGE_NAME} ..."
+if ! tar xf "${PACKAGE_NAME}"; then
+    echo "ERROR: decompress ${PACKAGE_NAME} failed"
+    rm -f "${PACKAGE_NAME}"
     exit 1
 fi
 
-rm -f ${PACKAGE_NAME}
+rm -f "${PACKAGE_NAME}"
 
-if [ ! -d ${DIR_NAME} ]; then
+if [ ! -d "${DIR_NAME}" ]; then
     echo "ERROR: ${DIR_NAME} does not exist"
     exit 1
 fi
 
-if [ -d ${TARGET_PATH} ]; then
+if [ -d "${TARGET_PATH}" ]; then
     echo "TARGET_PATH ${TARGET_PATH} has been generated, which means it and ${DIR_NAME} are the same dir thus do not do mv any more"
     exit 0
 fi
 
-mv ${DIR_NAME} ${TARGET_PATH}
+mv "${DIR_NAME}" "${TARGET_PATH}"
54 changes: 32 additions & 22 deletions build_tools/pack_server.sh
@@ -149,30 +149,40 @@ pack_server_lib crypto $separate_servers
 pack_server_lib ssl $separate_servers
 
 # Pack hadoop-related files.
-# If you want to use hdfs service to backup/restore/bulkload pegasus tables,
-# you need to set env ${HADOOP_HOME}, edit ${HADOOP_HOME}/etc/hadoop/core-site.xml,
-# and specify the keytab file.
-if [ -n "$HADOOP_HOME" ] && [ -n "$keytab_file" ]; then
-    mkdir -p ${pack}/hadoop
-    copy_file $keytab_file ${pack}/hadoop
-    copy_file ${HADOOP_HOME}/etc/hadoop/core-site.xml ${pack}/hadoop
-    if [ -d $HADOOP_HOME/share/hadoop ]; then
-        for f in ${HADOOP_HOME}/share/hadoop/common/lib/*.jar; do
-            copy_file $f ${pack}/hadoop
-        done
-        for f in ${HADOOP_HOME}/share/hadoop/common/*.jar; do
-            copy_file $f ${pack}/hadoop
-        done
-        for f in ${HADOOP_HOME}/share/hadoop/hdfs/lib/*.jar; do
-            copy_file $f ${pack}/hadoop
-        done
-        for f in ${HADOOP_HOME}/share/hadoop/hdfs/*.jar; do
-            copy_file $f ${pack}/hadoop
-        done
+# If you want to use hdfs service to backup/restore/bulkload pegasus tables, you need to
+# set env ${HADOOP_HOME} to the proper directory where contains Hadoop *.jar files.
+if [ -n "$HADOOP_HOME" ]; then
+    # Verify one of the jars.
+    arch_output=$(arch)
+    if [ "$arch_output"x == "aarch64"x ]; then
+        HDFS_JAR_MD5="fcc09dbed936cd8673918774cc3ead6b"
+    else
+        if [ "$arch_output"x != "x86_64"x ]; then
+            echo "WARNING: unrecognized CPU architecture '$arch_output', use 'x86_64' as default"
+        fi
+        HDFS_JAR_MD5="f67f3a5613c885e1622b1056fd94262b"
+    fi
+    HDFS_JAR=${HADOOP_HOME}/share/hadoop/hdfs/hadoop-hdfs-3.3.6.jar
+    if [ "$(md5sum "${HDFS_JAR}" | awk '{print$1}')" != "${HDFS_JAR_MD5}" ]; then
+        echo "check file ${HDFS_JAR} md5sum failed!"
+        exit 1
     fi
+    # Pack the jars.
+    mkdir -p ${pack}/hadoop
+    for f in ${HADOOP_HOME}/share/hadoop/common/lib/*.jar; do
+        copy_file $f ${pack}/hadoop
+    done
+    for f in ${HADOOP_HOME}/share/hadoop/common/*.jar; do
+        copy_file $f ${pack}/hadoop
+    done
+    for f in ${HADOOP_HOME}/share/hadoop/hdfs/lib/*.jar; do
+        copy_file $f ${pack}/hadoop
+    done
+    for f in ${HADOOP_HOME}/share/hadoop/hdfs/*.jar; do
+        copy_file $f ${pack}/hadoop
+    done
 else
-    echo "Couldn't find env ${HADOOP_HOME} or no valid keytab file was specified,
-    hadoop-related files were not packed."
+    echo "Couldn't find env HADOOP_HOME, hadoop-related files were not packed."
 fi
 
 DISTRIB_ID=$(cat /etc/*-release | grep DISTRIB_ID | awk -F'=' '{print $2}')
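
A usage sketch (the installation path is illustrative): with the keytab requirement dropped, packing the Hadoop jars only needs HADOOP_HOME to point at a Hadoop 3.3.6 distribution whose hdfs jar matches the pinned MD5.

    # pack_server.sh now verifies share/hadoop/hdfs/hadoop-hdfs-3.3.6.jar
    # against the architecture-specific checksum before copying the jars
    # into ${pack}/hadoop.
    export HADOOP_HOME=/opt/hadoop-3.3.6
    ./run.sh pack_server
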
11 changes: 7 additions & 4 deletions run.sh
@@ -28,12 +28,13 @@ export REPORT_DIR="$ROOT/test_report"
 export THIRDPARTY_ROOT=${PEGASUS_THIRDPARTY_ROOT:-"$ROOT/thirdparty"}
 ARCH_TYPE=''
 arch_output=$(arch)
-if [ "$arch_output"x == "x86_64"x ]; then
-    ARCH_TYPE="amd64"
-elif [ "$arch_output"x == "aarch64"x ]; then
+if [ "$arch_output"x == "aarch64"x ]; then
     ARCH_TYPE="aarch64"
 else
-    echo "WARNING: unsupported CPU architecture '$arch_output', use 'x86_64' as default"
+    if [ "$arch_output"x != "x86_64"x ]; then
+        echo "WARNING: unrecognized CPU architecture '$arch_output', use 'x86_64' as default"
+    fi
+    ARCH_TYPE="amd64"
 fi
 export LD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/${ARCH_TYPE}:${JAVA_HOME}/jre/lib/${ARCH_TYPE}/server:${BUILD_LATEST_DIR}/output/lib:${THIRDPARTY_ROOT}/output/lib:${LD_LIBRARY_PATH}
 # Disable AddressSanitizerOneDefinitionRuleViolation, see https://github.com/google/sanitizers/issues/1017 for details.
@@ -2105,6 +2106,8 @@ case $cmd in
         ;;
     pack_server)
         shift
+        # source the config_hdfs.sh to get the HADOOP_HOME.
+        source "${ROOT}"/scripts/config_hdfs.sh
         PEGASUS_ROOT=$ROOT ./build_tools/pack_server.sh $*
         ;;
     pack_client)
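
scripts/config_hdfs.sh itself is not shown in this commit; the only contract visible here is that pack_server.sh can read HADOOP_HOME after the script is sourced. A hypothetical minimal sketch of such a script:

    #!/usr/bin/env bash
    # Hypothetical sketch of scripts/config_hdfs.sh: export HADOOP_HOME for
    # pack_server.sh, keeping any value already set in the environment.
    export HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop-3.3.6"}
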
2 changes: 1 addition & 1 deletion src/sample/run.sh
@@ -33,7 +33,7 @@ if [ "$arch_output"x == "x86_64"x ]; then
 elif [ "$arch_output"x == "aarch64"x ]; then
     ARCH_TYPE="aarch64"
 else
-    echo "WARNING: unsupported CPU architecture '$arch_output', use 'x86_64' as default"
+    echo "WARNING: unrecognized CPU architecture '$arch_output', use 'x86_64' as default"
 fi
 export LD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/${ARCH_TYPE}:${JAVA_HOME}/jre/lib/${ARCH_TYPE}/server:${PEGASUS_THIRDPARTY_ROOT}/output/lib:$(pwd)/../../lib:${LD_LIBRARY_PATH}
 
59 changes: 40 additions & 19 deletions thirdparty/CMakeLists.txt
@@ -120,17 +120,51 @@ ExternalProject_Add(gperftools
     DOWNLOAD_NO_PROGRESS true
 )
 
+ExternalProject_Add(abseil
+    URL ${OSS_URL_PREFIX}/abseil-20230802.1.zip
+        https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.1.zip
+    URL_MD5 5c6193dbc82834f8e762c6a28c9cc615
+    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${TP_OUTPUT}
+               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+               -DABSL_FIND_GOOGLETEST=OFF
+               -DCMAKE_CXX_STANDARD=17
+    DOWNLOAD_EXTRACT_TIMESTAMP true
+    DOWNLOAD_NO_PROGRESS true
+)
+
+ExternalProject_Add(protobuf
+    URL https://github.com/protocolbuffers/protobuf/archive/refs/tags/v27.0.tar.gz
+    URL_MD5 c96aaf02c8acea549d65bb7b2d549bf6
+    CMAKE_ARGS -DCMAKE_BUILD_TYPE=release
+               -Dprotobuf_BUILD_TESTS=OFF
+               -Dprotobuf_BUILD_PROTOC_BINARIES=ON
+               -Dprotobuf_BUILD_LIBUPB=ON
+               -Dprotobuf_ABSL_PROVIDER=package
+               -DBUILD_SHARED_LIBS=ON
+               -DBUILD_SHARED_HDFSPP=ON
+               -DHDFSPP_LIBRARY_ONLY=ON
+               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+               -DCMAKE_CXX_STANDARD=17
+               -DABSL_ROOT_DIR=${TP_OUTPUT}
+               -DCMAKE_INSTALL_PREFIX=${TP_OUTPUT}
+               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+               -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    DEPENDS abseil
+    DOWNLOAD_EXTRACT_TIMESTAMP true
+    DOWNLOAD_NO_PROGRESS true
+)
+
 set(HDFS_CLIENT_DIR "hadoop-hdfs-project/hadoop-hdfs-native-client")
 ExternalProject_Add(hadoop
-    URL ${OSS_URL_PREFIX}/hadoop-release-2.8.4.tar.gz
-        https://github.com/apache/hadoop/archive/refs/tags/rel/release-2.8.4.tar.gz
-    URL_MD5 a1be737d4bff14923689619ab6545a96
-    PATCH_COMMAND ""
+    URL https://mirrors.aliyun.com/apache/hadoop/common/hadoop-3.3.6/hadoop-3.3.6-src.tar.gz
+    URL_MD5 285c07d8ad2c837c8ee04a4fa49c73cd
+    PATCH_COMMAND patch -p1 < ${TP_DIR}/fix_hdfs_native_client.patch
     COMMAND cd ${HDFS_CLIENT_DIR} && mvn package -Pdist,native -DskipTests -Dmaven.javadoc.skip=true -Dtar
-    COMMAND cd ${HDFS_CLIENT_DIR} && cp -R target/hadoop-hdfs-native-client-2.8.4/include/. ${TP_OUTPUT}/include/hdfs && cp -R target/hadoop-hdfs-native-client-2.8.4/lib/native/. ${TP_OUTPUT}/lib
+    COMMAND cd ${HDFS_CLIENT_DIR} && cp -R target/hadoop-hdfs-native-client-3.3.6/include/. ${TP_OUTPUT}/include/hdfs && cp -R target/hadoop-hdfs-native-client-3.3.6/lib/native/. ${TP_OUTPUT}/lib
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
     INSTALL_COMMAND ""
+    DEPENDS protobuf
     DOWNLOAD_EXTRACT_TIMESTAMP true
     DOWNLOAD_NO_PROGRESS true
 )
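
Outside of CMake, the native-client build above can be reproduced by hand; a sketch assuming Maven, a JDK, and the usual libhdfs build dependencies are installed (the Pegasus checkout path is illustrative):

    # Fetch and unpack the Hadoop 3.3.6 source tree.
    wget https://mirrors.aliyun.com/apache/hadoop/common/hadoop-3.3.6/hadoop-3.3.6-src.tar.gz
    tar xf hadoop-3.3.6-src.tar.gz && cd hadoop-3.3.6-src

    # Apply the patch that replaces the thread_local probe with an explicit
    # REQUIRE_LIBHDFSPP switch (see fix_hdfs_native_client.patch below).
    patch -p1 < /path/to/pegasus/thirdparty/fix_hdfs_native_client.patch

    # Build only the HDFS native client, skipping tests and javadoc, exactly
    # as the ExternalProject above does.
    cd hadoop-hdfs-project/hadoop-hdfs-native-client
    mvn package -Pdist,native -DskipTests -Dmaven.javadoc.skip=true -Dtar
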
@@ -303,18 +303,6 @@ ExternalProject_Add(nlohmann_json
     DOWNLOAD_NO_PROGRESS true
 )
 
-ExternalProject_Add(abseil
-    URL ${OSS_URL_PREFIX}/abseil-20230802.1.zip
-        https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.1.zip
-    URL_MD5 5c6193dbc82834f8e762c6a28c9cc615
-    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${TP_OUTPUT}
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DABSL_FIND_GOOGLETEST=OFF
-               -DCMAKE_CXX_STANDARD=17
-    DOWNLOAD_EXTRACT_TIMESTAMP true
-    DOWNLOAD_NO_PROGRESS true
-)
-
 ExternalProject_Add(s2geometry
     URL ${OSS_URL_PREFIX}/s2geometry-0.10.0.tar.gz
         https://github.com/google/s2geometry/archive/refs/tags/v0.10.0.tar.gz
@@ -357,8 +379,7 @@ set(SNAPPY_OPTIONS
     -DSNAPPY_FUZZING_BUILD=OFF
     -DSNAPPY_INSTALL=ON)
 execute_process(COMMAND arch OUTPUT_VARIABLE ARCH_NAME OUTPUT_STRIP_TRAILING_WHITESPACE)
-message(STATUS "ARCH_NAME = ${ARCH_NAME}")
-if (ARCH_NAME EQUAL "x86_64")
+if (ARCH_NAME STREQUAL "x86_64")
     set(SNAPPY_OPTIONS
         ${SNAPPY_OPTIONS}
         -DSNAPPY_REQUIRE_AVX=ON
32 changes: 32 additions & 0 deletions thirdparty/fix_hdfs_native_client.patch
@@ -0,0 +1,32 @@
diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt
index 24ec297aa27b..e77c38435bba 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt
@@ -152,24 +152,13 @@ add_subdirectory(main/native/libhdfs-examples)

# Temporary fix to disable Libhdfs++ build on older systems that do not support thread_local
include(CheckCXXSourceCompiles)
-unset (THREAD_LOCAL_SUPPORTED CACHE)
set (CMAKE_REQUIRED_DEFINITIONS "-std=c++11")
set (CMAKE_REQUIRED_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
-check_cxx_source_compiles(
- "#include <thread>
- int main(void) {
- thread_local int s;
- return 0;
- }"
- THREAD_LOCAL_SUPPORTED)
-if (THREAD_LOCAL_SUPPORTED)
+if (REQUIRE_LIBHDFSPP)
add_subdirectory(main/native/libhdfspp)
else()
- message(WARNING
- "WARNING: Libhdfs++ library was not built because the required feature thread_local storage \
- is not supported by your compiler. Known compilers that support this feature: GCC 4.8+, Visual Studio 2015+, \
- Clang (community version 3.3+), Clang (version for Xcode 8+ and iOS 9+).")
-endif (THREAD_LOCAL_SUPPORTED)
+ message(WARNING "WARNING: Libhdfs++ library was not built because the REQUIRE_LIBHDFSPP is not enabled.")
+endif (REQUIRE_LIBHDFSPP)

if(REQUIRE_LIBWEBHDFS)
add_subdirectory(contrib/libwebhdfs)
