Skip to content

Commit

Permalink
Merge remote-tracking branch 'apache/master' into size_in_bytes_api
Browse files Browse the repository at this point in the history
  • Loading branch information
SemyonSinchenko committed Aug 28, 2024
2 parents 664edc4 + aa208ba commit 0a099bc
Show file tree
Hide file tree
Showing 356 changed files with 10,623 additions and 19,655 deletions.
12 changes: 5 additions & 7 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1112,14 +1112,12 @@ jobs:
with:
distribution: zulu
java-version: ${{ inputs.java }}
- name: start minikube
run: |
# See more in "Installation" https://minikube.sigs.k8s.io/docs/start/
curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
sudo install minikube-linux-amd64 /usr/local/bin/minikube
rm minikube-linux-amd64
- name: Start Minikube
uses: medyagh/[email protected]
with:
# GitHub Action limit: cpu: 2, memory: 6947MB; limit to 2U6G for better resource statistics
minikube start --cpus 2 --memory 6144
cpus: 2
memory: 6144m
- name: Print K8S pods and nodes info
run: |
kubectl get pods -A
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build_sparkr_window.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ jobs:
shell: cmd
env:
NOT_CRAN: true
SPARK_TESTING: 1
# See SPARK-27848. Currently installing some dependent packages causes
# "(converted from warning) unable to identify current timezone 'C':" for an unknown reason.
# This environment variable is a workaround that lets us test SparkR against a higher version.
Expand Down
5 changes: 5 additions & 0 deletions R/pkg/R/sparkR.R
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,11 @@ sparkR.session <- function(
enableHiveSupport = TRUE,
...) {

if (Sys.getenv("SPARK_TESTING") == "") {
warning(
"SparkR is deprecated from Apache Spark 4.0.0 and will be removed in a future version.")
}

sparkConfigMap <- convertNamedListToEnv(sparkConfig)
namedParams <- list(...)
if (length(namedParams) > 0) {
Expand Down
2 changes: 1 addition & 1 deletion R/pkg/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# R on Spark
# R on Spark (deprecated)

SparkR is an R package that provides a light-weight frontend to use Spark from R.

Expand Down
2 changes: 2 additions & 0 deletions R/pkg/vignettes/sparkr-vignettes.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ old_java_opt <- Sys.getenv("_JAVA_OPTIONS")
Sys.setenv("_JAVA_OPTIONS" = paste("-XX:-UsePerfData", old_java_opt, sep = " "))
```

SparkR is deprecated as of Apache Spark 4.0.0 and will be removed in a future version.

## Overview

SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](https://spark.apache.org/mllib/).
Expand Down
33 changes: 24 additions & 9 deletions connector/docker/spark-test/base/Dockerfile → binder/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,29 @@
# limitations under the License.
#

FROM ubuntu:20.04
FROM python:3.10-slim
# install the notebook package
RUN pip install --no-cache notebook jupyterlab

# Upgrade package index
# install a few other useful packages plus Open Java 17
# Remove unneeded /var/lib/apt/lists/* after install to reduce the
# docker image size (by ~30MB)
RUN apt-get update && \
apt-get install -y less openjdk-17-jre-headless iproute2 vim-tiny sudo openssh-server && \
rm -rf /var/lib/apt/lists/*
# create user with a home directory
ARG NB_USER
ARG NB_UID
ENV USER ${NB_USER}
ENV HOME /home/${NB_USER}

RUN adduser --disabled-password \
--gecos "Default user" \
--uid ${NB_UID} \
${NB_USER}
WORKDIR ${HOME}
USER ${USER}

# Make sure the contents of our repo are in ${HOME}
COPY . ${HOME}
USER root
RUN chown -R ${NB_UID} ${HOME}
RUN apt-get update && apt-get install -y openjdk-17-jre git coreutils
USER ${NB_USER}

RUN binder/postBuild

ENV SPARK_HOME /opt/spark
2 changes: 0 additions & 2 deletions binder/apt.txt

This file was deleted.

2 changes: 1 addition & 1 deletion binder/postBuild
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ set -o pipefail
set -e

VERSION=$(python -c "exec(open('python/pyspark/version.py').read()); print(__version__)")
TAG=$(git describe --tags --exact-match 2>/dev/null)
TAG=$(git describe --tags --exact-match 2> /dev/null || true)

# If a commit is tagged, the exact tagged version of pyspark should be installed to avoid
# accidentally installing an older version of pyspark in the live notebook environment.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,11 @@ public static int indexOf(final UTF8String target, final UTF8String pattern,
// Initialize the string search with respect to the specified ICU collation.
String targetStr = target.toValidString();
String patternStr = pattern.toValidString();
// Check if `start` is out of bounds. The provided offset `start` is given in number of
// codepoints, so a simple `targetStr.length` check is not sufficient here. This check is
// needed because `String.offsetByCodePoints` throws an `IndexOutOfBoundsException`
// when the offset is out of bounds.
if (targetStr.codePointCount(0, targetStr.length()) <= start) return MATCH_NOT_FOUND;
StringSearch stringSearch =
CollationFactory.getStringSearch(targetStr, patternStr, collationId);
stringSearch.setOverlapping(true);
Expand Down Expand Up @@ -990,20 +995,29 @@ public static UTF8String lowercaseTrimLeft(
while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next()));

// Iterate over `srcString` from the left to find the first character that is not in the set.
int searchIndex = 0, codePoint;
int searchIndex = 0, codePoint, codePointBuffer = -1;
Iterator<Integer> srcIter = srcString.codePointIterator();
while (srcIter.hasNext()) {
codePoint = getLowercaseCodePoint(srcIter.next());
// Get the next code point from either the buffer or the iterator.
if (codePointBuffer != -1) {
codePoint = codePointBuffer;
codePointBuffer = -1;
}
else {
codePoint = getLowercaseCodePoint(srcIter.next());
}
// Special handling for Turkish dotted uppercase letter I.
if (codePoint == CODE_POINT_LOWERCASE_I && srcIter.hasNext() &&
trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
int nextCodePoint = getLowercaseCodePoint(srcIter.next());
if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint))
|| nextCodePoint == CODE_POINT_COMBINING_DOT) {
codePointBuffer = codePoint;
codePoint = getLowercaseCodePoint(srcIter.next());
if (codePoint == CODE_POINT_COMBINING_DOT) {
searchIndex += 2;
}
else {
if (trimChars.contains(codePoint)) ++searchIndex;
codePointBuffer = -1;
} else if (trimChars.contains(codePointBuffer)) {
++searchIndex;
codePointBuffer = codePoint;
} else {
break;
}
} else if (trimChars.contains(codePoint)) {
Expand Down Expand Up @@ -1100,20 +1114,28 @@ public static UTF8String lowercaseTrimRight(
while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next()));

// Iterate over `srcString` from the right to find the first character that is not in the set.
int searchIndex = srcString.numChars(), codePoint;
int searchIndex = srcString.numChars(), codePoint, codePointBuffer = -1;
Iterator<Integer> srcIter = srcString.reverseCodePointIterator();
while (srcIter.hasNext()) {
codePoint = getLowercaseCodePoint(srcIter.next());
if (codePointBuffer != -1) {
codePoint = codePointBuffer;
codePointBuffer = -1;
}
else {
codePoint = getLowercaseCodePoint(srcIter.next());
}
// Special handling for Turkish dotted uppercase letter I.
if (codePoint == CODE_POINT_COMBINING_DOT && srcIter.hasNext() &&
trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
int nextCodePoint = getLowercaseCodePoint(srcIter.next());
if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint))
|| nextCodePoint == CODE_POINT_LOWERCASE_I) {
codePointBuffer = codePoint;
codePoint = getLowercaseCodePoint(srcIter.next());
if (codePoint == CODE_POINT_LOWERCASE_I) {
searchIndex -= 2;
}
else {
if (trimChars.contains(codePoint)) --searchIndex;
codePointBuffer = -1;
} else if (trimChars.contains(codePointBuffer)) {
--searchIndex;
codePointBuffer = codePoint;
} else {
break;
}
} else if (trimChars.contains(codePoint)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2329,6 +2329,27 @@ public void testStringLocate() throws SparkException {
assertStringLocate("b", "a🙃x🙃b", 4, "UTF8_LCASE", 5);
assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE", 5);
assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE_CI", 5);
// Out of bounds test cases.
assertStringLocate("a", "asd", 4, "UTF8_BINARY", 0);
assertStringLocate("a", "asd", 4, "UTF8_LCASE", 0);
assertStringLocate("a", "asd", 4, "UNICODE", 0);
assertStringLocate("a", "asd", 4, "UNICODE_CI", 0);
assertStringLocate("a", "asd", 100, "UTF8_BINARY", 0);
assertStringLocate("a", "asd", 100, "UTF8_LCASE", 0);
assertStringLocate("a", "asd", 100, "UNICODE", 0);
assertStringLocate("a", "asd", 100, "UNICODE_CI", 0);
assertStringLocate("a", "🙃🙃", 4, "UTF8_BINARY", 0);
assertStringLocate("a", "🙃🙃", 4, "UTF8_LCASE", 0);
assertStringLocate("a", "🙃🙃", 4, "UNICODE", 0);
assertStringLocate("a", "🙃🙃", 4, "UNICODE_CI", 0);
assertStringLocate("", "asd", 100, "UTF8_BINARY", 1);
assertStringLocate("", "asd", 100, "UTF8_LCASE", 1);
assertStringLocate("", "asd", 100, "UNICODE", 1);
assertStringLocate("", "asd", 100, "UNICODE_CI", 1);
assertStringLocate("asd", "", 100, "UTF8_BINARY", 0);
assertStringLocate("asd", "", 100, "UTF8_LCASE", 0);
assertStringLocate("asd", "", 100, "UNICODE", 0);
assertStringLocate("asd", "", 100, "UNICODE_CI", 0);
}

/**
Expand Down Expand Up @@ -2741,6 +2762,10 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UTF8_BINARY", "ixi", "i", "x");
assertStringTrim("UTF8_BINARY", "i", "İ", "i");
assertStringTrim("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
assertStringTrim("UTF8_BINARY", "ii\u0307", "İi", "\u0307");
assertStringTrim("UTF8_BINARY", "iii\u0307", "İi", "\u0307");
assertStringTrim("UTF8_BINARY", "iiii\u0307", "iİ", "\u0307");
assertStringTrim("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "\u0307ii\u0307");
assertStringTrim("UTF8_BINARY", "i\u0307", "i", "\u0307");
assertStringTrim("UTF8_BINARY", "i\u0307", "\u0307", "i");
assertStringTrim("UTF8_BINARY", "i\u0307", "i\u0307", "");
Expand All @@ -2766,6 +2791,10 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UTF8_LCASE", "ixi", "i", "x");
assertStringTrim("UTF8_LCASE", "i", "İ", "i");
assertStringTrim("UTF8_LCASE", "i\u0307", "İ", "");
assertStringTrim("UTF8_LCASE", "ii\u0307", "İi", "");
assertStringTrim("UTF8_LCASE", "iii\u0307", "İi", "");
assertStringTrim("UTF8_LCASE", "iiii\u0307", "iİ", "");
assertStringTrim("UTF8_LCASE", "ii\u0307ii\u0307", "iİ", "");
assertStringTrim("UTF8_LCASE", "i\u0307", "i", "\u0307");
assertStringTrim("UTF8_LCASE", "i\u0307", "\u0307", "i");
assertStringTrim("UTF8_LCASE", "i\u0307", "i\u0307", "");
Expand All @@ -2791,6 +2820,10 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UNICODE", "ixi", "i", "x");
assertStringTrim("UNICODE", "i", "İ", "i");
assertStringTrim("UNICODE", "i\u0307", "İ", "i\u0307");
assertStringTrim("UNICODE", "ii\u0307", "İi", "i\u0307");
assertStringTrim("UNICODE", "iii\u0307", "İi", "i\u0307");
assertStringTrim("UNICODE", "iiii\u0307", "iİ", "i\u0307");
assertStringTrim("UNICODE", "ii\u0307ii\u0307", "iİ", "i\u0307ii\u0307");
assertStringTrim("UNICODE", "i\u0307", "i", "i\u0307");
assertStringTrim("UNICODE", "i\u0307", "\u0307", "i\u0307");
assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307");
Expand All @@ -2817,6 +2850,10 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UNICODE_CI", "ixi", "i", "x");
assertStringTrim("UNICODE_CI", "i", "İ", "i");
assertStringTrim("UNICODE_CI", "i\u0307", "İ", "");
assertStringTrim("UNICODE_CI", "ii\u0307", "İi", "");
assertStringTrim("UNICODE_CI", "iii\u0307", "İi", "");
assertStringTrim("UNICODE_CI", "iiii\u0307", "iİ", "");
assertStringTrim("UNICODE_CI", "ii\u0307ii\u0307", "iİ", "");
assertStringTrim("UNICODE_CI", "i\u0307", "i", "i\u0307");
assertStringTrim("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
Expand Down Expand Up @@ -3021,6 +3058,10 @@ public void testStringTrimLeft() throws SparkException {
assertStringTrimLeft("UTF8_BINARY", "ixi", "i", "xi");
assertStringTrimLeft("UTF8_BINARY", "i", "İ", "i");
assertStringTrimLeft("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
assertStringTrimLeft("UTF8_BINARY", "ii\u0307", "İi", "\u0307");
assertStringTrimLeft("UTF8_BINARY", "iii\u0307", "İi", "\u0307");
assertStringTrimLeft("UTF8_BINARY", "iiii\u0307", "iİ", "\u0307");
assertStringTrimLeft("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "\u0307ii\u0307");
assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i", "\u0307");
assertStringTrimLeft("UTF8_BINARY", "i\u0307", "\u0307", "i\u0307");
assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i\u0307", "");
Expand All @@ -3046,6 +3087,10 @@ public void testStringTrimLeft() throws SparkException {
assertStringTrimLeft("UTF8_LCASE", "ixi", "i", "xi");
assertStringTrimLeft("UTF8_LCASE", "i", "İ", "i");
assertStringTrimLeft("UTF8_LCASE", "i\u0307", "İ", "");
assertStringTrimLeft("UTF8_LCASE", "ii\u0307", "İi", "");
assertStringTrimLeft("UTF8_LCASE", "iii\u0307", "İi", "");
assertStringTrimLeft("UTF8_LCASE", "iiii\u0307", "iİ", "");
assertStringTrimLeft("UTF8_LCASE", "ii\u0307ii\u0307", "iİ", "");
assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i", "\u0307");
assertStringTrimLeft("UTF8_LCASE", "i\u0307", "\u0307", "i\u0307");
assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i\u0307", "");
Expand All @@ -3071,6 +3116,10 @@ public void testStringTrimLeft() throws SparkException {
assertStringTrimLeft("UNICODE", "ixi", "i", "xi");
assertStringTrimLeft("UNICODE", "i", "İ", "i");
assertStringTrimLeft("UNICODE", "i\u0307", "İ", "i\u0307");
assertStringTrimLeft("UNICODE", "ii\u0307", "İi", "i\u0307");
assertStringTrimLeft("UNICODE", "iii\u0307", "İi", "i\u0307");
assertStringTrimLeft("UNICODE", "iiii\u0307", "iİ", "i\u0307");
assertStringTrimLeft("UNICODE", "ii\u0307ii\u0307", "iİ", "i\u0307ii\u0307");
assertStringTrimLeft("UNICODE", "i\u0307", "i", "i\u0307");
assertStringTrimLeft("UNICODE", "i\u0307", "\u0307", "i\u0307");
assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307");
Expand All @@ -3097,6 +3146,10 @@ public void testStringTrimLeft() throws SparkException {
assertStringTrimLeft("UNICODE_CI", "ixi", "i", "xi");
assertStringTrimLeft("UNICODE_CI", "i", "İ", "i");
assertStringTrimLeft("UNICODE_CI", "i\u0307", "İ", "");
assertStringTrimLeft("UNICODE_CI", "ii\u0307", "İi", "");
assertStringTrimLeft("UNICODE_CI", "iii\u0307", "İi", "");
assertStringTrimLeft("UNICODE_CI", "iiii\u0307", "iİ", "");
assertStringTrimLeft("UNICODE_CI", "ii\u0307ii\u0307", "iİ", "");
assertStringTrimLeft("UNICODE_CI", "i\u0307", "i", "i\u0307");
assertStringTrimLeft("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
Expand Down Expand Up @@ -3302,6 +3355,10 @@ public void testStringTrimRight() throws SparkException {
assertStringTrimRight("UTF8_BINARY", "ixi", "i", "ix");
assertStringTrimRight("UTF8_BINARY", "i", "İ", "i");
assertStringTrimRight("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
assertStringTrimRight("UTF8_BINARY", "ii\u0307", "İi", "ii\u0307");
assertStringTrimRight("UTF8_BINARY", "iii\u0307", "İi", "iii\u0307");
assertStringTrimRight("UTF8_BINARY", "iiii\u0307", "iİ", "iiii\u0307");
assertStringTrimRight("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "ii\u0307ii\u0307");
assertStringTrimRight("UTF8_BINARY", "i\u0307", "i", "i\u0307");
assertStringTrimRight("UTF8_BINARY", "i\u0307", "\u0307", "i");
assertStringTrimRight("UTF8_BINARY", "i\u0307", "i\u0307", "");
Expand All @@ -3327,6 +3384,10 @@ public void testStringTrimRight() throws SparkException {
assertStringTrimRight("UTF8_LCASE", "ixi", "i", "ix");
assertStringTrimRight("UTF8_LCASE", "i", "İ", "i");
assertStringTrimRight("UTF8_LCASE", "i\u0307", "İ", "");
assertStringTrimRight("UTF8_LCASE", "ii\u0307", "İi", "");
assertStringTrimRight("UTF8_LCASE", "iii\u0307", "İi", "");
assertStringTrimRight("UTF8_LCASE", "iiii\u0307", "iİ", "");
assertStringTrimRight("UTF8_LCASE", "ii\u0307ii\u0307", "iİ", "");
assertStringTrimRight("UTF8_LCASE", "i\u0307", "i", "i\u0307");
assertStringTrimRight("UTF8_LCASE", "i\u0307", "\u0307", "i");
assertStringTrimRight("UTF8_LCASE", "i\u0307", "i\u0307", "");
Expand All @@ -3352,6 +3413,10 @@ public void testStringTrimRight() throws SparkException {
assertStringTrimRight("UNICODE", "ixi", "i", "ix");
assertStringTrimRight("UNICODE", "i", "İ", "i");
assertStringTrimRight("UNICODE", "i\u0307", "İ", "i\u0307");
assertStringTrimRight("UNICODE", "ii\u0307", "İi", "ii\u0307");
assertStringTrimRight("UNICODE", "iii\u0307", "İi", "iii\u0307");
assertStringTrimRight("UNICODE", "iiii\u0307", "iİ", "iiii\u0307");
assertStringTrimRight("UNICODE", "ii\u0307ii\u0307", "iİ", "ii\u0307ii\u0307");
assertStringTrimRight("UNICODE", "i\u0307", "i", "i\u0307");
assertStringTrimRight("UNICODE", "i\u0307", "\u0307", "i\u0307");
assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307");
Expand All @@ -3378,6 +3443,10 @@ public void testStringTrimRight() throws SparkException {
assertStringTrimRight("UNICODE_CI", "ixi", "i", "ix");
assertStringTrimRight("UNICODE_CI", "i", "İ", "i");
assertStringTrimRight("UNICODE_CI", "i\u0307", "İ", "");
assertStringTrimRight("UNICODE_CI", "ii\u0307", "İi", "");
assertStringTrimRight("UNICODE_CI", "iii\u0307", "İi", "");
assertStringTrimRight("UNICODE_CI", "iiii\u0307", "iİ", "");
assertStringTrimRight("UNICODE_CI", "ii\u0307ii\u0307", "iİ", "");
assertStringTrimRight("UNICODE_CI", "i\u0307", "i", "i\u0307");
assertStringTrimRight("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
Expand Down
Loading

0 comments on commit 0a099bc

Please sign in to comment.