diff --git a/fess-crawler/pom.xml b/fess-crawler/pom.xml index ed149a55..46413389 100644 --- a/fess-crawler/pom.xml +++ b/fess-crawler/pom.xml @@ -341,7 +341,43 @@ - + + com.jcraft + jsch + 0.1.55 + + + com.github.lookfirst + sardine + 5.12 + + + org.eclipse.jgit + org.eclipse.jgit + 6.10.0.202406032230-r + + + software.amazon.awssdk + s3 + 2.28.25 + + + com.azure + azure-storage-blob + 12.28.1 + + + com.google.cloud + google-cloud-storage + 2.44.1 + + + com.google.code.findbugs + jsr305 + + + + junit junit ${junit.version} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/aws/AwsS3Client.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/aws/AwsS3Client.java new file mode 100644 index 00000000..4a61bc35 --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/aws/AwsS3Client.java @@ -0,0 +1,361 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +package org.codelibs.fess.crawler.client.aws; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.InputStream; +import java.util.Date; +import java.util.HashSet; +import java.util.Set; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.codelibs.core.io.CloseableUtil; +import org.codelibs.core.io.CopyUtil; +import org.codelibs.core.io.FileUtil; +import org.codelibs.core.io.InputStreamUtil; +import org.codelibs.core.lang.StringUtil; +import org.codelibs.core.timer.TimeoutManager; +import org.codelibs.core.timer.TimeoutTask; +import org.codelibs.fess.crawler.Constants; +import org.codelibs.fess.crawler.builder.RequestDataBuilder; +import org.codelibs.fess.crawler.client.AbstractCrawlerClient; +import org.codelibs.fess.crawler.client.AccessTimeoutTarget; +import org.codelibs.fess.crawler.entity.RequestData; +import org.codelibs.fess.crawler.entity.ResponseData; +import org.codelibs.fess.crawler.exception.ChildUrlsException; +import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.helper.ContentLengthHelper; +import org.codelibs.fess.crawler.helper.MimeTypeHelper; + +import jakarta.annotation.Resource; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.core.ResponseInputStream; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.GetObjectRequest; +import software.amazon.awssdk.services.s3.model.GetObjectResponse; +import software.amazon.awssdk.services.s3.model.GetObjectTaggingRequest; +import software.amazon.awssdk.services.s3.model.GetObjectTaggingResponse; +import 
software.amazon.awssdk.services.s3.model.HeadObjectRequest; +import software.amazon.awssdk.services.s3.model.HeadObjectResponse; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; +import software.amazon.awssdk.services.s3.model.NoSuchKeyException; +import software.amazon.awssdk.services.s3.model.S3Object; + +/** + * A crawler client implementation for accessing and retrieving content from AWS S3. + * This client supports operations on Amazon S3 buckets and objects. + * + *

This client requires the following initialization parameters:
+ * <ul>
+ * <li>{@code region} - the AWS region (optional; defaults to {@code us-east-1})</li>
+ * <li>{@code accessKey} - the AWS access key ID (required)</li>
+ * <li>{@code secretKey} - the AWS secret access key (required)</li>
+ * </ul>
+ * <p>
+ * 
The client supports URLs in the format: {@code s3://bucket-name/object-key} + * + * @author shinsuke + */ +public class AwsS3Client extends AbstractCrawlerClient { + + private static final Logger logger = LogManager.getLogger(AwsS3Client.class); + + /** The character encoding to use for content. Defaults to UTF-8. */ + protected String charset = Constants.UTF_8; + + /** Helper for managing content length validation and limits. */ + @Resource + protected ContentLengthHelper contentLengthHelper; + + /** Flag indicating whether the client has been initialized. */ + protected volatile boolean isInit = false; + + /** The AWS S3 client instance. */ + protected S3Client s3Client; + + /** + * Creates a new AwsS3Client instance. + */ + public AwsS3Client() { + super(); + } + + @Override + public synchronized void init() { + if (isInit) { + return; + } + + super.init(); + + final String region = getInitParameter("region", "us-east-1", String.class); + final String accessKey = getInitParameter("accessKey", null, String.class); + if (StringUtil.isBlank(accessKey)) { + throw new CrawlingAccessException("accessKey is blank."); + } + final String secretKey = getInitParameter("secretKey", null, String.class); + if (StringUtil.isBlank(secretKey)) { + throw new CrawlingAccessException("secretKey is blank."); + } + + try { + final AwsBasicCredentials credentials = AwsBasicCredentials.create(accessKey, secretKey); + s3Client = S3Client.builder() + .region(Region.of(region)) + .credentialsProvider(StaticCredentialsProvider.create(credentials)) + .build(); + } catch (final Exception e) { + throw new CrawlingAccessException("Failed to create AWS S3 client: region=" + region, e); + } + + isInit = true; + if (logger.isInfoEnabled()) { + logger.info("AWS S3 client initialized successfully: region={}", region); + } + } + + @Override + public void close() { + if (s3Client != null) { + s3Client.close(); + } + isInit = false; + } + + @Override + public ResponseData doGet(final String uri) { + 
return processRequest(uri, true); + } + + @Override + public ResponseData doHead(final String url) { + try { + final ResponseData responseData = processRequest(url, false); + responseData.setMethod(Constants.HEAD_METHOD); + return responseData; + } catch (final ChildUrlsException e) { + return null; + } + } + + /** + * Processes an S3 request with timeout management. + */ + protected ResponseData processRequest(final String uri, final boolean includeContent) { + if (!isInit) { + init(); + } + + // start + AccessTimeoutTarget accessTimeoutTarget = null; + TimeoutTask accessTimeoutTask = null; + if (accessTimeout != null) { + accessTimeoutTarget = new AccessTimeoutTarget(Thread.currentThread()); + accessTimeoutTask = TimeoutManager.getInstance().addTimeoutTarget(accessTimeoutTarget, accessTimeout, false); + } + + try { + return getResponseData(uri, includeContent); + } finally { + if (accessTimeoutTarget != null) { + accessTimeoutTarget.stop(); + if (accessTimeoutTask != null && !accessTimeoutTask.isCanceled()) { + accessTimeoutTask.cancel(); + } + } + } + } + + /** + * Retrieves response data for the specified URI. + */ + protected ResponseData getResponseData(final String uri, final boolean includeContent) { + if (logger.isDebugEnabled()) { + logger.debug("Accessing S3 object: uri={}, includeContent={}", uri, includeContent); + } + + final ResponseData responseData = new ResponseData(); + try { + responseData.setMethod(includeContent ? 
Constants.GET_METHOD : Constants.HEAD_METHOD); + final String normalizedUri = normalizeUri(uri); + responseData.setUrl(normalizedUri); + + final String[] paths = parsePath(normalizedUri.replaceFirst("^s3:/+", StringUtil.EMPTY)); + final String bucketName = paths[0]; + final String key = paths[1]; + if (logger.isDebugEnabled()) { + logger.debug("Parsed S3 path: bucket={}, key={}", bucketName, key); + } + + HeadObjectResponse headResponse = null; + try { + headResponse = s3Client.headObject(HeadObjectRequest.builder().bucket(bucketName).key(key).build()); + } catch (final NoSuchKeyException e) { + if (logger.isDebugEnabled()) { + logger.debug("Object not found: bucket={}, key={}", bucketName, key); + } + } + + if (headResponse == null) { + // Try to list objects with prefix + final Set requestDataSet = new HashSet<>(); + final ListObjectsV2Request listRequest = + ListObjectsV2Request.builder().bucket(bucketName).prefix(key).delimiter("/").build(); + final ListObjectsV2Response listResponse = s3Client.listObjectsV2(listRequest); + + for (final S3Object s3Object : listResponse.contents()) { + final String objectKey = s3Object.key(); + requestDataSet.add(RequestDataBuilder.newRequestData().get().url("s3://" + bucketName + "/" + objectKey).build()); + } + throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData"); + } + + // Object found + responseData.setHttpStatusCode(Constants.OK_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(headResponse.contentLength()); + checkMaxContentLength(responseData); + + if (headResponse.lastModified() != null) { + responseData.setLastModified(Date.from(headResponse.lastModified())); + } + if (headResponse.contentType() != null) { + responseData.setMimeType(headResponse.contentType()); + } + + if (contentLengthHelper != null) { + final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType()); + if (responseData.getContentLength() > maxLength) { + throw 
new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + + maxLength + " byte. The url is " + normalizedUri); + } + } + + if (includeContent) { + // Get object tags + try { + final GetObjectTaggingResponse taggingResponse = + s3Client.getObjectTagging(GetObjectTaggingRequest.builder().bucket(bucketName).key(key).build()); + taggingResponse.tagSet().forEach(tag -> responseData.addMetaData(tag.key(), tag.value())); + } catch (final Exception e) { + logger.warn("Failed to get object tags: bucket={}, key={}", bucketName, key, e); + } + + // Get object content + if (headResponse.contentLength() < maxCachedContentSize) { + final GetObjectRequest getRequest = GetObjectRequest.builder().bucket(bucketName).key(key).build(); + try (ResponseInputStream contentStream = + s3Client.getObject(getRequest); + InputStream in = new BufferedInputStream(contentStream)) { + responseData.setResponseBody(InputStreamUtil.getBytes(in)); + } catch (final Exception e) { + logger.warn("Failed to read S3 object content: bucket={}, key={}, size={}", bucketName, key, + headResponse.contentLength(), e); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + } + } else { + File outputFile = null; + try { + outputFile = createTempFile("crawler-AwsS3Client-", ".out", null); + final GetObjectRequest getRequest = GetObjectRequest.builder().bucket(bucketName).key(key).build(); + try (ResponseInputStream in = s3Client.getObject(getRequest)) { + CopyUtil.copy(in, outputFile); + } + responseData.setResponseBody(outputFile, true); + if (logger.isDebugEnabled()) { + logger.debug( + "Object size exceeds cache threshold, using temp file: bucket={}, key={}, size={}, threshold={}, tempFile={}", + bucketName, key, headResponse.contentLength(), maxCachedContentSize, outputFile.getAbsolutePath()); + } + } catch (final Exception e) { + logger.warn("Failed to write S3 object to temp file: bucket={}, key={}, size={}, tempFile={}", bucketName, key, + 
headResponse.contentLength(), outputFile != null ? outputFile.getAbsolutePath() : "null", e); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + FileUtil.deleteInBackground(outputFile); + } + } + + if (StringUtil.isBlank(responseData.getMimeType())) { + final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper"); + try (final InputStream is = responseData.getResponseBody()) { + responseData.setMimeType(mimeTypeHelper.getContentType(is, key)); + } catch (final Exception e) { + responseData.setMimeType(mimeTypeHelper.getContentType(null, key)); + } + } + } + + } catch (final CrawlerSystemException e) { + CloseableUtil.closeQuietly(responseData); + throw e; + } catch (final Exception e) { + CloseableUtil.closeQuietly(responseData); + throw new CrawlingAccessException("Could not access " + uri, e); + } + return responseData; + } + + /** + * Parses an S3 path into bucket name and key components. + */ + protected String[] parsePath(final String path) { + if (StringUtil.isNotEmpty(path)) { + final String[] values = path.split("/", 2); + if (values.length == 2) { + return values; + } + if (values.length == 1 && StringUtil.isNotEmpty(values[0])) { + return new String[] { values[0], StringUtil.EMPTY }; + } + } + throw new CrawlingAccessException("Invalid path: " + path); + } + + /** + * Normalizes the URI. + */ + protected String normalizeUri(final String uri) { + if (StringUtil.isEmpty(uri)) { + throw new CrawlerSystemException("The uri is empty."); + } + String normalized = uri; + if (!normalized.startsWith("s3:")) { + normalized = "s3://" + normalized; + } + return normalized; + } + + /** + * Gets the character encoding. + */ + public String getCharset() { + return charset; + } + + /** + * Sets the character encoding. 
+ */ + public void setCharset(final String charset) { + this.charset = charset; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/azure/AzureBlobClient.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/azure/AzureBlobClient.java new file mode 100644 index 00000000..14f03926 --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/azure/AzureBlobClient.java @@ -0,0 +1,345 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +package org.codelibs.fess.crawler.client.azure; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.InputStream; +import java.util.Date; +import java.util.HashSet; +import java.util.Set; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.codelibs.core.io.CloseableUtil; +import org.codelibs.core.io.CopyUtil; +import org.codelibs.core.io.FileUtil; +import org.codelibs.core.io.InputStreamUtil; +import org.codelibs.core.lang.StringUtil; +import org.codelibs.core.timer.TimeoutManager; +import org.codelibs.core.timer.TimeoutTask; +import org.codelibs.fess.crawler.Constants; +import org.codelibs.fess.crawler.builder.RequestDataBuilder; +import org.codelibs.fess.crawler.client.AbstractCrawlerClient; +import org.codelibs.fess.crawler.client.AccessTimeoutTarget; +import org.codelibs.fess.crawler.entity.RequestData; +import org.codelibs.fess.crawler.entity.ResponseData; +import org.codelibs.fess.crawler.exception.ChildUrlsException; +import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.helper.ContentLengthHelper; +import org.codelibs.fess.crawler.helper.MimeTypeHelper; + +import com.azure.storage.blob.BlobClient; +import com.azure.storage.blob.BlobContainerClient; +import com.azure.storage.blob.BlobServiceClient; +import com.azure.storage.blob.BlobServiceClientBuilder; +import com.azure.storage.blob.models.BlobItem; +import com.azure.storage.blob.models.BlobProperties; +import com.azure.storage.blob.models.BlobStorageException; +import com.azure.storage.blob.models.ListBlobsOptions; + +import jakarta.annotation.Resource; + +/** + * A crawler client implementation for accessing and retrieving content from Azure Blob Storage. + * This client supports operations on Azure Storage containers and blobs. + * + *

This client requires the following initialization parameters:
+ * <ul>
+ * <li>{@code connectionString} - the Azure Storage connection string (required)</li>
+ * </ul>
+ * <p>
+ * 
The client supports URLs in the format: {@code azure://container-name/blob-name} + * + * @author shinsuke + */ +public class AzureBlobClient extends AbstractCrawlerClient { + + private static final Logger logger = LogManager.getLogger(AzureBlobClient.class); + + /** The character encoding to use for content. Defaults to UTF-8. */ + protected String charset = Constants.UTF_8; + + /** Helper for managing content length validation and limits. */ + @Resource + protected ContentLengthHelper contentLengthHelper; + + /** Flag indicating whether the client has been initialized. */ + protected volatile boolean isInit = false; + + /** The Azure Blob Service client instance. */ + protected BlobServiceClient blobServiceClient; + + /** + * Creates a new AzureBlobClient instance. + */ + public AzureBlobClient() { + super(); + } + + @Override + public synchronized void init() { + if (isInit) { + return; + } + + super.init(); + + final String connectionString = getInitParameter("connectionString", null, String.class); + if (StringUtil.isBlank(connectionString)) { + throw new CrawlingAccessException("connectionString is blank."); + } + + try { + blobServiceClient = new BlobServiceClientBuilder().connectionString(connectionString).buildClient(); + } catch (final Exception e) { + throw new CrawlingAccessException("Failed to create Azure Blob Service client", e); + } + + isInit = true; + if (logger.isInfoEnabled()) { + logger.info("Azure Blob client initialized successfully"); + } + } + + @Override + public void close() { + isInit = false; + } + + @Override + public ResponseData doGet(final String uri) { + return processRequest(uri, true); + } + + @Override + public ResponseData doHead(final String url) { + try { + final ResponseData responseData = processRequest(url, false); + responseData.setMethod(Constants.HEAD_METHOD); + return responseData; + } catch (final ChildUrlsException e) { + return null; + } + } + + /** + * Processes an Azure Blob request with timeout management. 
+ */ + protected ResponseData processRequest(final String uri, final boolean includeContent) { + if (!isInit) { + init(); + } + + // start + AccessTimeoutTarget accessTimeoutTarget = null; + TimeoutTask accessTimeoutTask = null; + if (accessTimeout != null) { + accessTimeoutTarget = new AccessTimeoutTarget(Thread.currentThread()); + accessTimeoutTask = TimeoutManager.getInstance().addTimeoutTarget(accessTimeoutTarget, accessTimeout, false); + } + + try { + return getResponseData(uri, includeContent); + } finally { + if (accessTimeoutTarget != null) { + accessTimeoutTarget.stop(); + if (accessTimeoutTask != null && !accessTimeoutTask.isCanceled()) { + accessTimeoutTask.cancel(); + } + } + } + } + + /** + * Retrieves response data for the specified URI. + */ + protected ResponseData getResponseData(final String uri, final boolean includeContent) { + if (logger.isDebugEnabled()) { + logger.debug("Accessing Azure blob: uri={}, includeContent={}", uri, includeContent); + } + + final ResponseData responseData = new ResponseData(); + try { + responseData.setMethod(includeContent ? 
Constants.GET_METHOD : Constants.HEAD_METHOD); + final String normalizedUri = normalizeUri(uri); + responseData.setUrl(normalizedUri); + + final String[] paths = parsePath(normalizedUri.replaceFirst("^azure:/+", StringUtil.EMPTY)); + final String containerName = paths[0]; + final String blobName = paths[1]; + if (logger.isDebugEnabled()) { + logger.debug("Parsed Azure path: container={}, blob={}", containerName, blobName); + } + + final BlobContainerClient containerClient = blobServiceClient.getBlobContainerClient(containerName); + final BlobClient blobClient = containerClient.getBlobClient(blobName); + + BlobProperties properties = null; + try { + properties = blobClient.getProperties(); + } catch (final BlobStorageException e) { + if (e.getStatusCode() == 404) { + if (logger.isDebugEnabled()) { + logger.debug("Blob not found: container={}, blob={}", containerName, blobName); + } + } else { + throw e; + } + } + + if (properties == null) { + // Try to list blobs with prefix + final Set requestDataSet = new HashSet<>(); + final ListBlobsOptions options = new ListBlobsOptions().setPrefix(blobName); + + for (final BlobItem blobItem : containerClient.listBlobs(options, null)) { + final String itemName = blobItem.getName(); + requestDataSet.add(RequestDataBuilder.newRequestData().get().url("azure://" + containerName + "/" + itemName).build()); + } + throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData"); + } + + // Blob found + responseData.setHttpStatusCode(Constants.OK_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(properties.getBlobSize()); + checkMaxContentLength(responseData); + + if (properties.getLastModified() != null) { + responseData.setLastModified(Date.from(properties.getLastModified().toInstant())); + } + if (properties.getContentType() != null) { + responseData.setMimeType(properties.getContentType()); + } + + // Add metadata + if (properties.getMetadata() != null) { + 
properties.getMetadata().forEach((key, value) -> responseData.addMetaData(key, value)); + } + + if (contentLengthHelper != null) { + final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType()); + if (responseData.getContentLength() > maxLength) { + throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + + maxLength + " byte. The url is " + normalizedUri); + } + } + + if (includeContent) { + // Add tags as metadata + try { + blobClient.getTags().forEach((key, value) -> responseData.addMetaData("tag_" + key, value)); + } catch (final Exception e) { + logger.warn("Failed to get blob tags: container={}, blob={}", containerName, blobName, e); + } + + // Get blob content + if (properties.getBlobSize() < maxCachedContentSize) { + try (InputStream contentStream = new BufferedInputStream(blobClient.openInputStream())) { + responseData.setResponseBody(InputStreamUtil.getBytes(contentStream)); + } catch (final Exception e) { + logger.warn("Failed to read Azure blob content: container={}, blob={}, size={}", containerName, blobName, + properties.getBlobSize(), e); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + } + } else { + File outputFile = null; + try { + outputFile = createTempFile("crawler-AzureBlobClient-", ".out", null); + try (InputStream in = blobClient.openInputStream()) { + CopyUtil.copy(in, outputFile); + } + responseData.setResponseBody(outputFile, true); + if (logger.isDebugEnabled()) { + logger.debug( + "Blob size exceeds cache threshold, using temp file: container={}, blob={}, size={}, threshold={}, tempFile={}", + containerName, blobName, properties.getBlobSize(), maxCachedContentSize, outputFile.getAbsolutePath()); + } + } catch (final Exception e) { + logger.warn("Failed to write Azure blob to temp file: container={}, blob={}, size={}, tempFile={}", containerName, + blobName, properties.getBlobSize(), outputFile != null ? 
outputFile.getAbsolutePath() : "null", e); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + FileUtil.deleteInBackground(outputFile); + } + } + + if (StringUtil.isBlank(responseData.getMimeType())) { + final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper"); + try (final InputStream is = responseData.getResponseBody()) { + responseData.setMimeType(mimeTypeHelper.getContentType(is, blobName)); + } catch (final Exception e) { + responseData.setMimeType(mimeTypeHelper.getContentType(null, blobName)); + } + } + } + + } catch (final CrawlerSystemException e) { + CloseableUtil.closeQuietly(responseData); + throw e; + } catch (final Exception e) { + CloseableUtil.closeQuietly(responseData); + throw new CrawlingAccessException("Could not access " + uri, e); + } + return responseData; + } + + /** + * Parses an Azure path into container name and blob name components. + */ + protected String[] parsePath(final String path) { + if (StringUtil.isNotEmpty(path)) { + final String[] values = path.split("/", 2); + if (values.length == 2) { + return values; + } + if (values.length == 1 && StringUtil.isNotEmpty(values[0])) { + return new String[] { values[0], StringUtil.EMPTY }; + } + } + throw new CrawlingAccessException("Invalid path: " + path); + } + + /** + * Normalizes the URI. + */ + protected String normalizeUri(final String uri) { + if (StringUtil.isEmpty(uri)) { + throw new CrawlerSystemException("The uri is empty."); + } + String normalized = uri; + if (!normalized.startsWith("azure:")) { + normalized = "azure://" + normalized; + } + return normalized; + } + + /** + * Gets the character encoding. + */ + public String getCharset() { + return charset; + } + + /** + * Sets the character encoding. 
+ */ + public void setCharset(final String charset) { + this.charset = charset; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/gcp/GoogleCloudStorageClient.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/gcp/GoogleCloudStorageClient.java new file mode 100644 index 00000000..162ae680 --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/gcp/GoogleCloudStorageClient.java @@ -0,0 +1,351 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +package org.codelibs.fess.crawler.client.gcp; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.nio.channels.Channels; +import java.util.Date; +import java.util.HashSet; +import java.util.Set; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.codelibs.core.io.CloseableUtil; +import org.codelibs.core.io.CopyUtil; +import org.codelibs.core.io.FileUtil; +import org.codelibs.core.io.InputStreamUtil; +import org.codelibs.core.lang.StringUtil; +import org.codelibs.core.timer.TimeoutManager; +import org.codelibs.core.timer.TimeoutTask; +import org.codelibs.fess.crawler.Constants; +import org.codelibs.fess.crawler.builder.RequestDataBuilder; +import org.codelibs.fess.crawler.client.AbstractCrawlerClient; +import org.codelibs.fess.crawler.client.AccessTimeoutTarget; +import org.codelibs.fess.crawler.entity.RequestData; +import org.codelibs.fess.crawler.entity.ResponseData; +import org.codelibs.fess.crawler.exception.ChildUrlsException; +import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.helper.ContentLengthHelper; +import org.codelibs.fess.crawler.helper.MimeTypeHelper; + +import com.google.auth.oauth2.GoogleCredentials; +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.BlobId; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageException; +import com.google.cloud.storage.StorageOptions; + +import jakarta.annotation.Resource; + +/** + * A crawler client implementation for accessing and retrieving content from Google Cloud Storage. + * This client supports operations on GCS buckets and objects. + * + *

This client requires the following initialization parameters:
+ * <ul>
+ * <li>{@code projectId} - the Google Cloud project ID (required)</li>
+ * <li>{@code credentialsFile} - path to a service account credentials JSON file (optional; default credentials are used when omitted)</li>
+ * </ul>
+ * <p>
+ * 
The client supports URLs in the format: {@code gs://bucket-name/object-name} + * + * @author shinsuke + */ +public class GoogleCloudStorageClient extends AbstractCrawlerClient { + + private static final Logger logger = LogManager.getLogger(GoogleCloudStorageClient.class); + + /** The character encoding to use for content. Defaults to UTF-8. */ + protected String charset = Constants.UTF_8; + + /** Helper for managing content length validation and limits. */ + @Resource + protected ContentLengthHelper contentLengthHelper; + + /** Flag indicating whether the client has been initialized. */ + protected volatile boolean isInit = false; + + /** The Google Cloud Storage client instance. */ + protected Storage storage; + + /** + * Creates a new GoogleCloudStorageClient instance. + */ + public GoogleCloudStorageClient() { + super(); + } + + @Override + public synchronized void init() { + if (isInit) { + return; + } + + super.init(); + + final String projectId = getInitParameter("projectId", null, String.class); + if (StringUtil.isBlank(projectId)) { + throw new CrawlingAccessException("projectId is blank."); + } + + try { + final StorageOptions.Builder builder = StorageOptions.newBuilder().setProjectId(projectId); + + final String credentialsFile = getInitParameter("credentialsFile", null, String.class); + if (StringUtil.isNotBlank(credentialsFile)) { + try (FileInputStream credentialsStream = new FileInputStream(credentialsFile)) { + final GoogleCredentials credentials = GoogleCredentials.fromStream(credentialsStream); + builder.setCredentials(credentials); + } + } + + storage = builder.build().getService(); + } catch (final Exception e) { + throw new CrawlingAccessException("Failed to create Google Cloud Storage client: projectId=" + projectId, e); + } + + isInit = true; + if (logger.isInfoEnabled()) { + logger.info("Google Cloud Storage client initialized successfully: projectId={}", projectId); + } + } + + @Override + public void close() { + if (storage != null) { + try 
{ + storage.close(); + } catch (final Exception e) { + logger.warn("Failed to close Google Cloud Storage client", e); + } + } + isInit = false; + } + + @Override + public ResponseData doGet(final String uri) { + return processRequest(uri, true); + } + + @Override + public ResponseData doHead(final String url) { + try { + final ResponseData responseData = processRequest(url, false); + responseData.setMethod(Constants.HEAD_METHOD); + return responseData; + } catch (final ChildUrlsException e) { + return null; + } + } + + /** + * Processes a GCS request with timeout management. + */ + protected ResponseData processRequest(final String uri, final boolean includeContent) { + if (!isInit) { + init(); + } + + // start + AccessTimeoutTarget accessTimeoutTarget = null; + TimeoutTask accessTimeoutTask = null; + if (accessTimeout != null) { + accessTimeoutTarget = new AccessTimeoutTarget(Thread.currentThread()); + accessTimeoutTask = TimeoutManager.getInstance().addTimeoutTarget(accessTimeoutTarget, accessTimeout, false); + } + + try { + return getResponseData(uri, includeContent); + } finally { + if (accessTimeoutTarget != null) { + accessTimeoutTarget.stop(); + if (accessTimeoutTask != null && !accessTimeoutTask.isCanceled()) { + accessTimeoutTask.cancel(); + } + } + } + } + + /** + * Retrieves response data for the specified URI. + */ + protected ResponseData getResponseData(final String uri, final boolean includeContent) { + if (logger.isDebugEnabled()) { + logger.debug("Accessing GCS object: uri={}, includeContent={}", uri, includeContent); + } + + final ResponseData responseData = new ResponseData(); + try { + responseData.setMethod(includeContent ? 
Constants.GET_METHOD : Constants.HEAD_METHOD); + final String normalizedUri = normalizeUri(uri); + responseData.setUrl(normalizedUri); + + final String[] paths = parsePath(normalizedUri.replaceFirst("^gs:/+", StringUtil.EMPTY)); + final String bucketName = paths[0]; + final String objectName = paths[1]; + if (logger.isDebugEnabled()) { + logger.debug("Parsed GCS path: bucket={}, object={}", bucketName, objectName); + } + + Blob blob = null; + try { + blob = storage.get(BlobId.of(bucketName, objectName)); + } catch (final StorageException e) { + if (e.getCode() == 404) { + if (logger.isDebugEnabled()) { + logger.debug("Object not found: bucket={}, object={}", bucketName, objectName); + } + } else { + throw e; + } + } + + if (blob == null || !blob.exists()) { + // Try to list objects with prefix + final Set requestDataSet = new HashSet<>(); + final Iterable blobs = storage.list(bucketName, Storage.BlobListOption.prefix(objectName)).iterateAll(); + + for (final Blob blobItem : blobs) { + final String itemName = blobItem.getName(); + requestDataSet.add(RequestDataBuilder.newRequestData().get().url("gs://" + bucketName + "/" + itemName).build()); + } + throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData"); + } + + // Object found + responseData.setHttpStatusCode(Constants.OK_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(blob.getSize()); + checkMaxContentLength(responseData); + + if (blob.getUpdateTimeOffsetDateTime() != null) { + responseData.setLastModified(Date.from(blob.getUpdateTimeOffsetDateTime().toInstant())); + } + if (blob.getContentType() != null) { + responseData.setMimeType(blob.getContentType()); + } + + // Add metadata + if (blob.getMetadata() != null) { + blob.getMetadata().forEach((key, value) -> responseData.addMetaData(key, value)); + } + + if (contentLengthHelper != null) { + final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType()); + if 
(responseData.getContentLength() > maxLength) { + throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + + maxLength + " byte. The url is " + normalizedUri); + } + } + + if (includeContent) { + // Get object content + if (blob.getSize() < maxCachedContentSize) { + try (InputStream contentStream = new BufferedInputStream(Channels.newInputStream(blob.reader()))) { + responseData.setResponseBody(InputStreamUtil.getBytes(contentStream)); + } catch (final Exception e) { + logger.warn("Failed to read GCS object content: bucket={}, object={}, size={}", bucketName, objectName, + blob.getSize(), e); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + } + } else { + File outputFile = null; + try { + outputFile = createTempFile("crawler-GoogleCloudStorageClient-", ".out", null); + blob.downloadTo(outputFile.toPath()); + responseData.setResponseBody(outputFile, true); + if (logger.isDebugEnabled()) { + logger.debug( + "Object size exceeds cache threshold, using temp file: bucket={}, object={}, size={}, threshold={}, tempFile={}", + bucketName, objectName, blob.getSize(), maxCachedContentSize, outputFile.getAbsolutePath()); + } + } catch (final Exception e) { + logger.warn("Failed to write GCS object to temp file: bucket={}, object={}, size={}, tempFile={}", bucketName, + objectName, blob.getSize(), outputFile != null ? 
outputFile.getAbsolutePath() : "null", e); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + FileUtil.deleteInBackground(outputFile); + } + } + + if (StringUtil.isBlank(responseData.getMimeType())) { + final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper"); + try (final InputStream is = responseData.getResponseBody()) { + responseData.setMimeType(mimeTypeHelper.getContentType(is, objectName)); + } catch (final Exception e) { + responseData.setMimeType(mimeTypeHelper.getContentType(null, objectName)); + } + } + } + + } catch (final CrawlerSystemException e) { + CloseableUtil.closeQuietly(responseData); + throw e; + } catch (final Exception e) { + CloseableUtil.closeQuietly(responseData); + throw new CrawlingAccessException("Could not access " + uri, e); + } + return responseData; + } + + /** + * Parses a GCS path into bucket name and object name components. + */ + protected String[] parsePath(final String path) { + if (StringUtil.isNotEmpty(path)) { + final String[] values = path.split("/", 2); + if (values.length == 2) { + return values; + } + if (values.length == 1 && StringUtil.isNotEmpty(values[0])) { + return new String[] { values[0], StringUtil.EMPTY }; + } + } + throw new CrawlingAccessException("Invalid path: " + path); + } + + /** + * Normalizes the URI. + */ + protected String normalizeUri(final String uri) { + if (StringUtil.isEmpty(uri)) { + throw new CrawlerSystemException("The uri is empty."); + } + String normalized = uri; + if (!normalized.startsWith("gs:")) { + normalized = "gs://" + normalized; + } + return normalized; + } + + /** + * Gets the character encoding. + */ + public String getCharset() { + return charset; + } + + /** + * Sets the character encoding. 
+ */ + public void setCharset(final String charset) { + this.charset = charset; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/git/GitAuthentication.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/git/GitAuthentication.java new file mode 100644 index 00000000..1b885d80 --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/git/GitAuthentication.java @@ -0,0 +1,157 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.git; + +import java.util.regex.Pattern; + +import org.codelibs.core.lang.StringUtil; + +/** + * Authentication information for Git connections. + * This class holds credentials for Git authentication. + * + * @author shinsuke + */ +public class GitAuthentication { + + /** The server URL pattern for matching. */ + protected Pattern serverPattern; + + /** The username for authentication. */ + protected String username; + + /** The password for authentication. */ + protected String password; + + /** The private key for public key authentication. */ + protected String privateKey; + + /** The passphrase for the private key. */ + protected String passphrase; + + /** + * Creates a new GitAuthentication instance. + */ + public GitAuthentication() { + // Default constructor + } + + /** + * Gets the server pattern. + * + * @return The server pattern. 
+ */ + public Pattern getServerPattern() { + return serverPattern; + } + + /** + * Sets the server pattern. + * + * @param serverPattern The server pattern to set. + */ + public void setServerPattern(final Pattern serverPattern) { + this.serverPattern = serverPattern; + } + + /** + * Sets the server pattern from a string. + * + * @param serverPattern The server pattern string. + */ + public void setServer(final String serverPattern) { + if (StringUtil.isNotBlank(serverPattern)) { + this.serverPattern = Pattern.compile(serverPattern); + } + } + + /** + * Gets the username. + * + * @return The username. + */ + public String getUsername() { + return username; + } + + /** + * Sets the username. + * + * @param username The username to set. + */ + public void setUsername(final String username) { + this.username = username; + } + + /** + * Gets the password. + * + * @return The password. + */ + public String getPassword() { + return password; + } + + /** + * Sets the password. + * + * @param password The password to set. + */ + public void setPassword(final String password) { + this.password = password; + } + + /** + * Gets the private key. + * + * @return The private key. + */ + public String getPrivateKey() { + return privateKey; + } + + /** + * Sets the private key. + * + * @param privateKey The private key to set. + */ + public void setPrivateKey(final String privateKey) { + this.privateKey = privateKey; + } + + /** + * Gets the passphrase. + * + * @return The passphrase. + */ + public String getPassphrase() { + return passphrase; + } + + /** + * Sets the passphrase. + * + * @param passphrase The passphrase to set. 
+ */ + public void setPassphrase(final String passphrase) { + this.passphrase = passphrase; + } + + @Override + public String toString() { + return "GitAuthentication [serverPattern=" + serverPattern + ", username=" + username + "]"; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/git/GitAuthenticationHolder.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/git/GitAuthenticationHolder.java new file mode 100644 index 00000000..91fc635e --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/git/GitAuthenticationHolder.java @@ -0,0 +1,72 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.git; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Holder for Git authentication information. + * This class manages multiple Git authentication credentials and matches them to repository URLs. + * + * @author shinsuke + */ +public class GitAuthenticationHolder { + + /** List of Git authentications. */ + protected List gitAuthenticationList = new ArrayList<>(); + + /** + * Creates a new GitAuthenticationHolder instance. + */ + public GitAuthenticationHolder() { + // Default constructor + } + + /** + * Adds a Git authentication to the holder. + * + * @param gitAuthentication The Git authentication to add. 
+ */ + public void add(final GitAuthentication gitAuthentication) { + gitAuthenticationList.add(gitAuthentication); + } + + /** + * Gets the Git authentication that matches the given URL. + * + * @param url The URL to match. + * @return The matching Git authentication, or null if no match is found. + */ + public GitAuthentication get(final String url) { + if (url == null) { + return null; + } + + for (final GitAuthentication gitAuthentication : gitAuthenticationList) { + final Pattern pattern = gitAuthentication.getServerPattern(); + if (pattern != null) { + final Matcher matcher = pattern.matcher(url); + if (matcher.matches()) { + return gitAuthentication; + } + } + } + return null; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/git/GitClient.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/git/GitClient.java new file mode 100644 index 00000000..150d961f --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/git/GitClient.java @@ -0,0 +1,577 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +package org.codelibs.fess.crawler.client.git; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.InputStream; +import java.net.URI; +import java.util.Date; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.codelibs.core.io.CloseableUtil; +import org.codelibs.core.io.CopyUtil; +import org.codelibs.core.io.FileUtil; +import org.codelibs.core.io.InputStreamUtil; +import org.codelibs.core.lang.StringUtil; +import org.codelibs.core.timer.TimeoutManager; +import org.codelibs.core.timer.TimeoutTask; +import org.codelibs.fess.crawler.Constants; +import org.codelibs.fess.crawler.builder.RequestDataBuilder; +import org.codelibs.fess.crawler.client.AbstractCrawlerClient; +import org.codelibs.fess.crawler.client.AccessTimeoutTarget; +import org.codelibs.fess.crawler.entity.RequestData; +import org.codelibs.fess.crawler.entity.ResponseData; +import org.codelibs.fess.crawler.exception.ChildUrlsException; +import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.helper.ContentLengthHelper; +import org.codelibs.fess.crawler.helper.MimeTypeHelper; +import org.eclipse.jgit.api.Git; +import org.eclipse.jgit.api.errors.GitAPIException; +import org.eclipse.jgit.lib.ObjectId; +import org.eclipse.jgit.lib.ObjectLoader; +import org.eclipse.jgit.lib.Ref; +import org.eclipse.jgit.lib.Repository; +import org.eclipse.jgit.revwalk.RevCommit; +import org.eclipse.jgit.revwalk.RevTree; +import org.eclipse.jgit.revwalk.RevWalk; +import org.eclipse.jgit.storage.file.FileRepositoryBuilder; +import org.eclipse.jgit.transport.UsernamePasswordCredentialsProvider; +import org.eclipse.jgit.treewalk.TreeWalk; + +import 
jakarta.annotation.Resource; + +/** + * GitClient is a crawler client implementation for accessing resources from Git repositories. + * It extends {@link AbstractCrawlerClient} and provides methods to retrieve content and metadata + * from Git repositories. The client supports various configurations, including authentication. + * + *

+ * The class uses JGit library for Git operations. + *

+ * + *

+ * URL format: git://repository-url/branch/path/to/file + * Example: git://https://github.com/user/repo/master/src/Main.java + *

+ * + * @author shinsuke + */ +public class GitClient extends AbstractCrawlerClient { + + /** Logger instance for this class */ + private static final Logger logger = LogManager.getLogger(GitClient.class); + + /** Property name for Git authentications */ + public static final String GIT_AUTHENTICATIONS_PROPERTY = "gitAuthentications"; + + /** Property name for local repository directory */ + public static final String LOCAL_REPO_DIR_PROPERTY = "localRepoDir"; + + /** Character encoding for Git operations */ + protected String charset = Constants.UTF_8; + + /** Helper for managing content length limits */ + @Resource + protected ContentLengthHelper contentLengthHelper; + + /** The Git authentication holder */ + protected volatile GitAuthenticationHolder gitAuthenticationHolder; + + /** Cache of opened Git repositories */ + protected final Map gitRepositoryCache = new ConcurrentHashMap<>(); + + /** Local directory for storing cloned repositories */ + protected File localRepoDir; + + /** + * Creates a new GitClient instance. 
+ */ + public GitClient() { + // Default constructor + } + + @Override + public synchronized void init() { + if (gitAuthenticationHolder != null) { + return; + } + + if (logger.isDebugEnabled()) { + logger.debug("Initializing GitClient..."); + } + + super.init(); + + // Initialize local repository directory + final String localRepoDirPath = getInitParameter(LOCAL_REPO_DIR_PROPERTY, null, String.class); + if (StringUtil.isNotBlank(localRepoDirPath)) { + localRepoDir = new File(localRepoDirPath); + } else { + try { + localRepoDir = File.createTempFile("git-crawler-", "-repos"); + if (!localRepoDir.delete() || !localRepoDir.mkdirs()) { + throw new CrawlerSystemException("Failed to create local repository directory: " + localRepoDir.getAbsolutePath()); + } + } catch (final Exception e) { + throw new CrawlerSystemException("Failed to create temporary directory for Git repositories", e); + } + } + + // Initialize Git authentication holder + final GitAuthenticationHolder holder = new GitAuthenticationHolder(); + final GitAuthentication[] gitAuthentications = + getInitParameter(GIT_AUTHENTICATIONS_PROPERTY, new GitAuthentication[0], GitAuthentication[].class); + if (gitAuthentications != null) { + for (final GitAuthentication gitAuthentication : gitAuthentications) { + if (logger.isDebugEnabled()) { + logger.debug("Adding GitAuthentication: {}", gitAuthentication); + } + holder.add(gitAuthentication); + } + } + gitAuthenticationHolder = holder; + + if (logger.isInfoEnabled()) { + logger.info("Git client initialized successfully: localRepoDir={}", localRepoDir.getAbsolutePath()); + } + } + + @Override + public void close() { + if (gitAuthenticationHolder == null) { + return; + } + if (logger.isDebugEnabled()) { + logger.debug("Closing GitClient..."); + } + + // Close all cached Git repositories + for (final Git git : gitRepositoryCache.values()) { + try { + git.close(); + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to close Git 
repository", e); + } + } + } + gitRepositoryCache.clear(); + + gitAuthenticationHolder = null; + + if (logger.isDebugEnabled()) { + logger.debug("Git client closed"); + } + } + + @Override + public ResponseData doGet(final String uri) { + return processRequest(uri, true); + } + + @Override + public ResponseData doHead(final String url) { + try { + final ResponseData responseData = processRequest(url, false); + responseData.setMethod(Constants.HEAD_METHOD); + return responseData; + } catch (final ChildUrlsException e) { + return null; + } + } + + /** + * Processes a Git request to retrieve data from the specified URI. + * + * @param uri The URI to retrieve data from + * @param includeContent Whether to include the actual content in the response + * @return The response data containing the retrieved information + * @throws CrawlingAccessException If the Git request fails + */ + protected ResponseData processRequest(final String uri, final boolean includeContent) { + if (gitAuthenticationHolder == null) { + init(); + } + + // start + AccessTimeoutTarget accessTimeoutTarget = null; + TimeoutTask accessTimeoutTask = null; + if (accessTimeout != null) { + accessTimeoutTarget = new AccessTimeoutTarget(Thread.currentThread()); + accessTimeoutTask = TimeoutManager.getInstance().addTimeoutTarget(accessTimeoutTarget, accessTimeout, false); + } + + try { + return getResponseData(uri, includeContent); + } finally { + if (accessTimeoutTarget != null) { + accessTimeoutTarget.stop(); + if (accessTimeoutTask != null && !accessTimeoutTask.isCanceled()) { + accessTimeoutTask.cancel(); + } + } + } + } + + /** + * Retrieves response data from the Git repository for the specified URI. 
+ * + * @param uri The URI to retrieve data from + * @param includeContent Whether to include the actual content in the response + * @return The response data containing the retrieved information + * @throws CrawlingAccessException If the Git operation fails + */ + protected ResponseData getResponseData(final String uri, final boolean includeContent) { + if (logger.isDebugEnabled()) { + logger.debug("Accessing Git resource: uri={}, includeContent={}", uri, includeContent); + } + + final ResponseData responseData = new ResponseData(); + try { + responseData.setMethod(includeContent ? Constants.GET_METHOD : Constants.HEAD_METHOD); + responseData.setUrl(uri); + + final GitInfo gitInfo = parseGitUri(uri); + final Git git = getOrCloneRepository(gitInfo); + final Repository repository = git.getRepository(); + + // Get the commit for the specified branch/ref + final Ref ref = repository.exactRef("refs/heads/" + gitInfo.getBranch()); + if (ref == null) { + responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(0); + return responseData; + } + + final RevWalk revWalk = new RevWalk(repository); + final RevCommit commit = revWalk.parseCommit(ref.getObjectId()); + final RevTree tree = commit.getTree(); + + if (StringUtil.isBlank(gitInfo.getPath()) || "/".equals(gitInfo.getPath())) { + // Root directory + return processDirectory(uri, includeContent, responseData, repository, tree, gitInfo, ""); + } + + final TreeWalk treeWalk = TreeWalk.forPath(repository, gitInfo.getPath(), tree); + if (treeWalk == null) { + responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(0); + return responseData; + } + + if (treeWalk.isSubtree()) { + // Directory + treeWalk.enterSubtree(); + return processDirectory(uri, includeContent, responseData, repository, tree, gitInfo, gitInfo.getPath()); + } else { + // File + return processFile(uri, 
includeContent, responseData, repository, treeWalk, gitInfo, commit); + } + + } catch (final CrawlerSystemException e) { + CloseableUtil.closeQuietly(responseData); + throw e; + } catch (final Exception e) { + CloseableUtil.closeQuietly(responseData); + throw new CrawlingAccessException("Could not access " + uri, e); + } + } + + /** + * Processes a directory in the Git repository. + */ + protected ResponseData processDirectory(final String uri, final boolean includeContent, final ResponseData responseData, + final Repository repository, final RevTree tree, final GitInfo gitInfo, final String path) throws Exception { + if (logger.isDebugEnabled()) { + logger.debug("Processing Git directory: path={}", path); + } + + final Set requestDataSet = new HashSet<>(); + if (includeContent) { + final TreeWalk treeWalk = new TreeWalk(repository); + treeWalk.addTree(tree); + treeWalk.setRecursive(false); + + if (StringUtil.isNotBlank(path)) { + treeWalk.setFilter(org.eclipse.jgit.treewalk.filter.PathFilter.create(path)); + if (treeWalk.next()) { + treeWalk.enterSubtree(); + } + } + + while (treeWalk.next()) { + final String childPath = treeWalk.getPathString(); + final String childUri = gitInfo.toChildUrl(childPath); + requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build()); + } + } + throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData"); + } + + /** + * Processes a file in the Git repository. 
+ */ + protected ResponseData processFile(final String uri, final boolean includeContent, final ResponseData responseData, + final Repository repository, final TreeWalk treeWalk, final GitInfo gitInfo, final RevCommit commit) throws Exception { + final ObjectId objectId = treeWalk.getObjectId(0); + final ObjectLoader loader = repository.open(objectId); + + responseData.setHttpStatusCode(Constants.OK_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(loader.getSize()); + checkMaxContentLength(responseData); + + responseData.setLastModified(new Date(commit.getCommitTime() * 1000L)); + + if (contentLengthHelper != null) { + final String mimeType = getMimeType(gitInfo.getFilename()); + final long maxLength = contentLengthHelper.getMaxLength(mimeType); + if (responseData.getContentLength() > maxLength) { + throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + + maxLength + " byte. The url is " + uri); + } + } + + if (includeContent) { + if (loader.getSize() < maxCachedContentSize) { + try (InputStream in = loader.openStream()) { + responseData.setResponseBody(InputStreamUtil.getBytes(in)); + } + } else { + File outputFile = null; + try { + outputFile = createTempFile("crawler-GitClient-", ".out", null); + try (InputStream in = loader.openStream()) { + CopyUtil.copy(in, outputFile); + } + responseData.setResponseBody(outputFile, true); + } catch (final Exception e) { + logger.warn("Failed to write Git file content to temp file: uri={}, size={}, tempFile={}", uri, loader.getSize(), + outputFile != null ? 
outputFile.getAbsolutePath() : "null", e); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + FileUtil.deleteInBackground(outputFile); + } + } + + final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper"); + try (final InputStream is = responseData.getResponseBody()) { + responseData.setMimeType(mimeTypeHelper.getContentType(is, gitInfo.getFilename())); + } catch (final Exception e) { + responseData.setMimeType(mimeTypeHelper.getContentType(null, gitInfo.getFilename())); + } + } + + return responseData; + } + + /** + * Gets or clones a Git repository. + */ + protected Git getOrCloneRepository(final GitInfo gitInfo) throws GitAPIException { + final String cacheKey = gitInfo.getRepositoryUrl(); + Git git = gitRepositoryCache.get(cacheKey); + if (git != null) { + return git; + } + + synchronized (gitRepositoryCache) { + git = gitRepositoryCache.get(cacheKey); + if (git != null) { + return git; + } + + final File repoDir = new File(localRepoDir, gitInfo.getRepositoryName()); + final GitAuthentication auth = gitAuthenticationHolder.get(gitInfo.getRepositoryUrl()); + + try { + if (repoDir.exists()) { + // Open existing repository + final Repository repository = + new FileRepositoryBuilder().setGitDir(new File(repoDir, ".git")).readEnvironment().findGitDir().build(); + git = new Git(repository); + } else { + // Clone repository + if (logger.isInfoEnabled()) { + logger.info("Cloning Git repository: url={}, dir={}", gitInfo.getRepositoryUrl(), repoDir.getAbsolutePath()); + } + git = Git.cloneRepository() + .setURI(gitInfo.getRepositoryUrl()) + .setDirectory(repoDir) + .setCredentialsProvider(createCredentialsProvider(auth)) + .call(); + } + + gitRepositoryCache.put(cacheKey, git); + return git; + } catch (final Exception e) { + throw new CrawlingAccessException("Failed to clone or open Git repository: " + gitInfo.getRepositoryUrl(), e); + } + } + } + + /** + * Creates a credentials provider for Git authentication. 
+ */ + protected UsernamePasswordCredentialsProvider createCredentialsProvider(final GitAuthentication auth) { + if (auth != null && StringUtil.isNotBlank(auth.getUsername())) { + return new UsernamePasswordCredentialsProvider(auth.getUsername(), auth.getPassword()); + } + return null; + } + + /** + * Parses a Git URI into components. + */ + protected GitInfo parseGitUri(final String uri) { + if (!uri.startsWith("git://")) { + throw new CrawlingAccessException("Invalid Git URI: " + uri); + } + + final String remainder = uri.substring(6); // Remove "git://" + final String[] parts = remainder.split("/", 3); + + if (parts.length < 2) { + throw new CrawlingAccessException("Invalid Git URI format. Expected: git://repository-url/branch[/path]: " + uri); + } + + final String repositoryUrl = parts[0]; + final String branch = parts[1]; + final String path = parts.length > 2 ? parts[2] : ""; + + return new GitInfo(repositoryUrl, branch, path); + } + + /** + * Gets the MIME type for a filename. + */ + protected String getMimeType(final String filename) { + final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper"); + return mimeTypeHelper.getContentType(null, filename); + } + + /** + * Inner class to hold Git URI information. 
+ */ + public static class GitInfo { + private final String repositoryUrl; + private final String branch; + private final String path; + + public GitInfo(final String repositoryUrl, final String branch, final String path) { + this.repositoryUrl = repositoryUrl; + this.branch = branch; + this.path = path; + } + + public String getRepositoryUrl() { + return repositoryUrl; + } + + public String getBranch() { + return branch; + } + + public String getPath() { + return path; + } + + public String getRepositoryName() { + try { + final URI uri = new URI(repositoryUrl); + String name = uri.getPath(); + if (name.endsWith(".git")) { + name = name.substring(0, name.length() - 4); + } + if (name.startsWith("/")) { + name = name.substring(1); + } + return name.replace("/", "_"); + } catch (final Exception e) { + return Integer.toHexString(repositoryUrl.hashCode()); + } + } + + public String getFilename() { + if (StringUtil.isBlank(path)) { + return ""; + } + final int index = path.lastIndexOf('/'); + if (index >= 0 && index < path.length() - 1) { + return path.substring(index + 1); + } + return path; + } + + public String toChildUrl(final String childPath) { + return "git://" + repositoryUrl + "/" + branch + "/" + childPath; + } + } + + /** + * Gets the character encoding used for Git operations. + * + * @return The character encoding + */ + public String getCharset() { + return charset; + } + + /** + * Sets the character encoding used for Git operations. + * + * @param charset The character encoding to set + */ + public void setCharset(final String charset) { + this.charset = charset; + } + + /** + * Sets the Git authentication holder. + * + * @param gitAuthenticationHolder The Git authentication holder to set + */ + public void setGitAuthenticationHolder(final GitAuthenticationHolder gitAuthenticationHolder) { + this.gitAuthenticationHolder = gitAuthenticationHolder; + } + + /** + * Gets the local repository directory. 
+ * + * @return The local repository directory + */ + public File getLocalRepoDir() { + return localRepoDir; + } + + /** + * Sets the local repository directory. + * + * @param localRepoDir The local repository directory to set + */ + public void setLocalRepoDir(final File localRepoDir) { + this.localRepoDir = localRepoDir; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/sftp/SftpAuthentication.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/sftp/SftpAuthentication.java new file mode 100644 index 00000000..4beebd2c --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/sftp/SftpAuthentication.java @@ -0,0 +1,178 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.sftp; + +import java.util.regex.Pattern; + +import org.codelibs.core.lang.StringUtil; + +/** + * Authentication information for SFTP connections. + * This class holds credentials and server details for SFTP authentication. + * + * @author shinsuke + */ +public class SftpAuthentication { + + /** The server URL pattern for matching. */ + protected Pattern serverPattern; + + /** The port number for the SFTP server. */ + protected int port = 22; + + /** The username for authentication. */ + protected String username; + + /** The password for authentication. 
*/ + protected String password; + + /** The private key for public key authentication. */ + protected String privateKey; + + /** The passphrase for the private key. */ + protected String passphrase; + + /** + * Creates a new SftpAuthentication instance. + */ + public SftpAuthentication() { + // Default constructor + } + + /** + * Gets the server pattern. + * + * @return The server pattern. + */ + public Pattern getServerPattern() { + return serverPattern; + } + + /** + * Sets the server pattern. + * + * @param serverPattern The server pattern to set. + */ + public void setServerPattern(final Pattern serverPattern) { + this.serverPattern = serverPattern; + } + + /** + * Sets the server pattern from a string. + * + * @param serverPattern The server pattern string. + */ + public void setServer(final String serverPattern) { + if (StringUtil.isNotBlank(serverPattern)) { + this.serverPattern = Pattern.compile(serverPattern); + } + } + + /** + * Gets the port number. + * + * @return The port number. + */ + public int getPort() { + return port; + } + + /** + * Sets the port number. + * + * @param port The port number to set. + */ + public void setPort(final int port) { + this.port = port; + } + + /** + * Gets the username. + * + * @return The username. + */ + public String getUsername() { + return username; + } + + /** + * Sets the username. + * + * @param username The username to set. + */ + public void setUsername(final String username) { + this.username = username; + } + + /** + * Gets the password. + * + * @return The password. + */ + public String getPassword() { + return password; + } + + /** + * Sets the password. + * + * @param password The password to set. + */ + public void setPassword(final String password) { + this.password = password; + } + + /** + * Gets the private key. + * + * @return The private key. + */ + public String getPrivateKey() { + return privateKey; + } + + /** + * Sets the private key. + * + * @param privateKey The private key to set. 
+ */ + public void setPrivateKey(final String privateKey) { + this.privateKey = privateKey; + } + + /** + * Gets the passphrase. + * + * @return The passphrase. + */ + public String getPassphrase() { + return passphrase; + } + + /** + * Sets the passphrase. + * + * @param passphrase The passphrase to set. + */ + public void setPassphrase(final String passphrase) { + this.passphrase = passphrase; + } + + @Override + public String toString() { + return "SftpAuthentication [serverPattern=" + serverPattern + ", port=" + port + ", username=" + username + "]"; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/sftp/SftpAuthenticationHolder.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/sftp/SftpAuthenticationHolder.java new file mode 100644 index 00000000..2cf8243c --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/sftp/SftpAuthenticationHolder.java @@ -0,0 +1,72 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.sftp; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Holder for SFTP authentication information. + * This class manages multiple SFTP authentication credentials and matches them to server URLs. 
+ * + * @author shinsuke + */ +public class SftpAuthenticationHolder { + + /** List of SFTP authentications. */ + protected List sftpAuthenticationList = new ArrayList<>(); + + /** + * Creates a new SftpAuthenticationHolder instance. + */ + public SftpAuthenticationHolder() { + // Default constructor + } + + /** + * Adds an SFTP authentication to the holder. + * + * @param sftpAuthentication The SFTP authentication to add. + */ + public void add(final SftpAuthentication sftpAuthentication) { + sftpAuthenticationList.add(sftpAuthentication); + } + + /** + * Gets the SFTP authentication that matches the given URL. + * + * @param url The URL to match. + * @return The matching SFTP authentication, or null if no match is found. + */ + public SftpAuthentication get(final String url) { + if (url == null) { + return null; + } + + for (final SftpAuthentication sftpAuthentication : sftpAuthenticationList) { + final Pattern pattern = sftpAuthentication.getServerPattern(); + if (pattern != null) { + final Matcher matcher = pattern.matcher(url); + if (matcher.matches()) { + return sftpAuthentication; + } + } + } + return null; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/sftp/SftpClient.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/sftp/SftpClient.java new file mode 100644 index 00000000..018e83cc --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/sftp/SftpClient.java @@ -0,0 +1,717 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.sftp; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Date; +import java.util.HashSet; +import java.util.Queue; +import java.util.Set; +import java.util.Vector; +import java.util.concurrent.ConcurrentLinkedQueue; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.codelibs.core.io.CloseableUtil; +import org.codelibs.core.io.CopyUtil; +import org.codelibs.core.io.FileUtil; +import org.codelibs.core.io.InputStreamUtil; +import org.codelibs.core.lang.StringUtil; +import org.codelibs.core.timer.TimeoutManager; +import org.codelibs.core.timer.TimeoutTask; +import org.codelibs.fess.crawler.Constants; +import org.codelibs.fess.crawler.builder.RequestDataBuilder; +import org.codelibs.fess.crawler.client.AbstractCrawlerClient; +import org.codelibs.fess.crawler.client.AccessTimeoutTarget; +import org.codelibs.fess.crawler.entity.RequestData; +import org.codelibs.fess.crawler.entity.ResponseData; +import org.codelibs.fess.crawler.exception.ChildUrlsException; +import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.helper.ContentLengthHelper; +import 
org.codelibs.fess.crawler.helper.MimeTypeHelper; + +import com.jcraft.jsch.ChannelSftp; +import com.jcraft.jsch.JSch; +import com.jcraft.jsch.JSchException; +import com.jcraft.jsch.Session; +import com.jcraft.jsch.SftpATTRS; +import com.jcraft.jsch.SftpException; + +import jakarta.annotation.Resource; + +/** + * SftpClient is a crawler client implementation for accessing resources via the SFTP protocol. + * It extends {@link AbstractCrawlerClient} and provides methods to retrieve content and metadata + * from SFTP servers. The client supports various configurations, including authentication, timeouts, + * and encoding settings. + * + *

 * <p>
 * The class uses JSch library for SFTP communication. It maintains a queue of ChannelSftp
 * instances to improve performance by reusing connections.
 * </p>
 *
 * <p>
 * The client can be configured with SFTP-specific settings via init parameters, such as:
 * </p>
 * <ul>
 * <li>connectTimeout: The timeout for establishing a connection to the SFTP server.</li>
 * <li>charset: The character encoding for file operations.</li>
 * <li>strictHostKeyChecking: Whether to strictly check host keys (default: no).</li>
 * <li>sftpAuthentications: An array of {@link SftpAuthentication} objects for different SFTP URLs.</li>
 * </ul>
+ * + * @author shinsuke + */ +public class SftpClient extends AbstractCrawlerClient { + + /** Logger instance for this class */ + private static final Logger logger = LogManager.getLogger(SftpClient.class); + + /** Metadata key for SFTP file owner */ + public static final String SFTP_FILE_OWNER = "sftpFileOwner"; + + /** Metadata key for SFTP file group */ + public static final String SFTP_FILE_GROUP = "sftpFileGroup"; + + /** Metadata key for SFTP file permissions */ + public static final String SFTP_FILE_PERMISSIONS = "sftpFilePermissions"; + + /** Property name for SFTP authentications */ + public static final String SFTP_AUTHENTICATIONS_PROPERTY = "sftpAuthentications"; + + /** Character encoding for SFTP operations */ + protected String charset = Constants.UTF_8; + + /** Helper for managing content length limits */ + @Resource + protected ContentLengthHelper contentLengthHelper; + + /** The SFTP authentication holder */ + protected volatile SftpAuthenticationHolder sftpAuthenticationHolder; + + /** The queue of ChannelSftp instances */ + protected final Queue sftpChannelQueue = new ConcurrentLinkedQueue<>(); + + /** The queue of Session instances */ + protected final Queue sessionQueue = new ConcurrentLinkedQueue<>(); + + /** The connect timeout */ + protected int connectTimeout = 10000; + + /** Whether to strictly check host keys */ + protected String strictHostKeyChecking = "no"; + + /** + * Creates a new SftpClient instance. 
+ */ + public SftpClient() { + // Default constructor + } + + @Override + public synchronized void init() { + if (sftpAuthenticationHolder != null) { + return; + } + + if (logger.isDebugEnabled()) { + logger.debug("Initializing SftpClient..."); + } + + super.init(); + + connectTimeout = getInitParameter("connectTimeout", connectTimeout, Integer.class); + strictHostKeyChecking = getInitParameter("strictHostKeyChecking", strictHostKeyChecking, String.class); + + // Initialize SFTP authentication holder + final SftpAuthenticationHolder holder = new SftpAuthenticationHolder(); + final SftpAuthentication[] sftpAuthentications = + getInitParameter(SFTP_AUTHENTICATIONS_PROPERTY, new SftpAuthentication[0], SftpAuthentication[].class); + if (sftpAuthentications != null) { + for (final SftpAuthentication sftpAuthentication : sftpAuthentications) { + if (logger.isDebugEnabled()) { + logger.debug("Adding SftpAuthentication: {}", sftpAuthentication); + } + holder.add(sftpAuthentication); + } + } + sftpAuthenticationHolder = holder; + + if (logger.isInfoEnabled()) { + logger.info("SFTP client initialized successfully: connectTimeout={}ms, strictHostKeyChecking={}", connectTimeout, + strictHostKeyChecking); + } + } + + @Override + public void close() { + if (sftpAuthenticationHolder == null) { + return; + } + if (logger.isDebugEnabled()) { + logger.debug("Closing SftpClient..."); + } + sftpAuthenticationHolder = null; + + for (final ChannelSftp channel : sftpChannelQueue) { + try { + if (channel.isConnected()) { + channel.disconnect(); + } + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to disconnect SFTP channel: connected={}", channel.isConnected(), e); + } + } + } + + for (final Session session : sessionQueue) { + try { + if (session.isConnected()) { + session.disconnect(); + } + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to disconnect SFTP session: connected={}", session.isConnected(), e); + } 
+ } + } + + if (logger.isDebugEnabled()) { + logger.debug("SFTP client closed"); + } + } + + @Override + public ResponseData doGet(final String uri) { + return processRequest(uri, true); + } + + @Override + public ResponseData doHead(final String url) { + try { + final ResponseData responseData = processRequest(url, false); + responseData.setMethod(Constants.HEAD_METHOD); + return responseData; + } catch (final ChildUrlsException e) { + return null; + } + } + + /** + * Processes an SFTP request to retrieve data from the specified URI. + * + * @param uri The URI to retrieve data from + * @param includeContent Whether to include the actual content in the response + * @return The response data containing the retrieved information + * @throws CrawlingAccessException If the SFTP request fails + */ + protected ResponseData processRequest(final String uri, final boolean includeContent) { + if (sftpAuthenticationHolder == null) { + init(); + } + + // start + AccessTimeoutTarget accessTimeoutTarget = null; + TimeoutTask accessTimeoutTask = null; + if (accessTimeout != null) { + accessTimeoutTarget = new AccessTimeoutTarget(Thread.currentThread()); + accessTimeoutTask = TimeoutManager.getInstance().addTimeoutTarget(accessTimeoutTarget, accessTimeout, false); + } + + try { + return getResponseData(uri, includeContent); + } finally { + if (accessTimeoutTarget != null) { + accessTimeoutTarget.stop(); + if (accessTimeoutTask != null && !accessTimeoutTask.isCanceled()) { + accessTimeoutTask.cancel(); + } + } + } + } + + /** + * Retrieves response data from the SFTP server for the specified URI. 
+ * + * @param uri The URI to retrieve data from + * @param includeContent Whether to include the actual content in the response + * @return The response data containing the retrieved information + * @throws CrawlingAccessException If the SFTP operation fails + */ + protected ResponseData getResponseData(final String uri, final boolean includeContent) { + final ResponseData responseData = new ResponseData(); + ChannelSftp channel = null; + try { + responseData.setMethod(Constants.GET_METHOD); + + final SftpInfo sftpInfo = new SftpInfo(uri, charset); + responseData.setUrl(sftpInfo.toUrl()); + + channel = getChannel(sftpInfo); + + if (sftpInfo.getPath() == null || sftpInfo.getPath().isEmpty() || "/".equals(sftpInfo.getPath())) { + // root directory + final Set requestDataSet = new HashSet<>(); + if (includeContent) { + try { + @SuppressWarnings("unchecked") + final Vector files = channel.ls("/"); + for (final ChannelSftp.LsEntry entry : files) { + if (!".".equals(entry.getFilename()) && !"..".equals(entry.getFilename())) { + final String childUri = sftpInfo.toChildUrl(entry.getFilename()); + requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build()); + } + } + } catch (final SftpException e) { + disconnectInternalChannel(channel); + throw new CrawlingAccessException("Could not access " + uri, e); + } + } + sftpChannelQueue.offer(channel); + throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData"); + } + + SftpATTRS attrs = null; + try { + attrs = channel.stat(sftpInfo.getPath()); + } catch (final SftpException e) { + if (logger.isDebugEnabled()) { + logger.debug("File not found: {}", sftpInfo.getPath()); + } + } + + updateResponseData(uri, includeContent, responseData, channel, sftpInfo, attrs); + } catch (final CrawlerSystemException e) { + CloseableUtil.closeQuietly(responseData); + throw e; + } catch (final Exception e) { + CloseableUtil.closeQuietly(responseData); + throw new 
CrawlingAccessException("Could not access " + uri, e); + } + + return responseData; + } + + /** + * Disconnects the internal SFTP channel and logs any errors. + * + * @param channel The SFTP channel to disconnect + */ + protected void disconnectInternalChannel(final ChannelSftp channel) { + try { + if (channel.isConnected()) { + channel.disconnect(); + } + } catch (final Exception e) { + logger.warn("Failed to disconnect SFTP channel: connected={}", channel.isConnected(), e); + } + } + + /** + * Updates the response data based on the SFTP file information. + * + * @param uri The original URI being accessed + * @param includeContent Whether to include the actual content in the response + * @param responseData The response data to update + * @param channel The SFTP channel used for the operation + * @param sftpInfo Information about the SFTP connection + * @param attrs The SFTP file attributes, or null if not found + */ + protected void updateResponseData(final String uri, final boolean includeContent, final ResponseData responseData, + final ChannelSftp channel, final SftpInfo sftpInfo, final SftpATTRS attrs) { + if (attrs == null) { + responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(0); + sftpChannelQueue.offer(channel); + return; + } + + if (attrs.isDir()) { + if (logger.isDebugEnabled()) { + logger.debug("Processing SFTP directory: {}", sftpInfo.getPath()); + } + final Set requestDataSet = new HashSet<>(); + if (includeContent) { + try { + @SuppressWarnings("unchecked") + final Vector files = channel.ls(sftpInfo.getPath()); + if (logger.isDebugEnabled()) { + logger.debug("Found {} entries in directory: {}", files.size(), sftpInfo.getPath()); + } + for (final ChannelSftp.LsEntry entry : files) { + if (!".".equals(entry.getFilename()) && !"..".equals(entry.getFilename())) { + final String childUri = sftpInfo.toChildUrl(entry.getFilename()); + 
requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build()); + } + } + } catch (final SftpException e) { + disconnectInternalChannel(channel); + throw new CrawlingAccessException("Could not access " + uri, e); + } + } + sftpChannelQueue.offer(channel); + throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData"); + } + + if (attrs.isReg()) { + responseData.setHttpStatusCode(Constants.OK_STATUS_CODE); + responseData.setCharSet(Constants.UTF_8); + responseData.setLastModified(new Date(attrs.getMTime() * 1000L)); + + // check file size + responseData.setContentLength(attrs.getSize()); + checkMaxContentLength(responseData); + + // Set file metadata + responseData.addMetaData(SFTP_FILE_OWNER, String.valueOf(attrs.getUId())); + responseData.addMetaData(SFTP_FILE_GROUP, String.valueOf(attrs.getGId())); + responseData.addMetaData(SFTP_FILE_PERMISSIONS, attrs.getPermissionsString()); + + if (includeContent) { + File tempFile = null; + File outputFile = null; + try { + tempFile = createTempFile("sftp-", ".tmp", null); + try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(tempFile)); + InputStream in = channel.get(sftpInfo.getPath())) { + CopyUtil.copy(in, out); + } + + final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper"); + try (InputStream is = new BufferedInputStream(new java.io.FileInputStream(tempFile))) { + responseData.setMimeType(mimeTypeHelper.getContentType(is, sftpInfo.getFilename())); + } catch (final Exception e) { + responseData.setMimeType(mimeTypeHelper.getContentType(null, sftpInfo.getFilename())); + } + + if (contentLengthHelper != null) { + final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType()); + if (responseData.getContentLength() > maxLength) { + throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + + " byte) is over " + maxLength + " byte. 
The url is " + uri); + } + } + + responseData.setCharSet(getCharSet(tempFile)); + + if (tempFile.length() < maxCachedContentSize) { + try (InputStream contentStream = new BufferedInputStream(new java.io.FileInputStream(tempFile))) { + responseData.setResponseBody(InputStreamUtil.getBytes(contentStream)); + } + } else { + outputFile = createTempFile("crawler-SftpClient-", ".out", null); + CopyUtil.copy(tempFile, outputFile); + responseData.setResponseBody(outputFile, true); + if (logger.isDebugEnabled()) { + logger.debug( + "File size exceeds cache threshold, using temp file: path={}, size={}, threshold={}, tempFile={}", + sftpInfo.getPath(), attrs.getSize(), maxCachedContentSize, outputFile.getAbsolutePath()); + } + } + sftpChannelQueue.offer(channel); + } catch (final CrawlingAccessException e) { + sftpChannelQueue.offer(channel); + throw e; + } catch (final Exception e) { + logger.warn("Failed to retrieve SFTP file content: uri={}, path={}, size={}", uri, sftpInfo.getPath(), attrs.getSize(), + e); + disconnectInternalChannel(channel); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + } finally { + FileUtil.deleteInBackground(tempFile); + } + } else { + sftpChannelQueue.offer(channel); + } + } else { + responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(0); + sftpChannelQueue.offer(channel); + } + } + + /** + * Determines the character set for the given file. + * + * @param file The file to determine the charset for + * @return The character set name + */ + protected String getCharSet(final File file) { + return charset; + } + + /** + * Gets the character encoding used for SFTP operations. + * + * @return The character encoding + */ + public String getCharset() { + return charset; + } + + /** + * Sets the character encoding used for SFTP operations. 
+ * + * @param charset The character encoding to set + */ + public void setCharset(final String charset) { + this.charset = charset; + } + + /** + * Gets or creates an SFTP channel for the specified SFTP information. + * + * @param info The SFTP information containing host, port, and other connection details + * @return A configured SFTP channel ready for use + * @throws JSchException If the SFTP channel cannot be created or connected + */ + protected ChannelSftp getChannel(final SftpInfo info) throws JSchException { + ChannelSftp channel = sftpChannelQueue.poll(); + if (channel != null && channel.isConnected()) { + return channel; + } + + Session session = null; + try { + final JSch jsch = new JSch(); + final SftpAuthentication auth = sftpAuthenticationHolder.get(info.toUrl()); + + if (auth != null && StringUtil.isNotBlank(auth.getPrivateKey())) { + if (StringUtil.isNotBlank(auth.getPassphrase())) { + jsch.addIdentity("sftp-key", auth.getPrivateKey().getBytes(Constants.UTF_8_CHARSET), null, + auth.getPassphrase().getBytes(Constants.UTF_8_CHARSET)); + } else { + jsch.addIdentity("sftp-key", auth.getPrivateKey().getBytes(Constants.UTF_8_CHARSET), null, null); + } + } + + final int port = auth != null ? auth.getPort() : info.getPort(); + final String username = auth != null && auth.getUsername() != null ? 
auth.getUsername() : "anonymous"; + + session = jsch.getSession(username, info.getHost(), port); + + if (auth != null && StringUtil.isNotBlank(auth.getPassword())) { + session.setPassword(auth.getPassword()); + } + + session.setConfig("StrictHostKeyChecking", strictHostKeyChecking); + session.setTimeout(connectTimeout); + session.connect(); + + channel = (ChannelSftp) session.openChannel("sftp"); + channel.connect(); + + sessionQueue.offer(session); + return channel; + } catch (final JSchException e) { + if (session != null && session.isConnected()) { + session.disconnect(); + } + throw e; + } + } + + /** + * SftpInfo is a helper class that encapsulates information about an SFTP URL. + */ + public static class SftpInfo { + + private static final int DEFAULT_SFTP_PORT = 22; + + private URI uri; + + private String path; + + /** + * Constructs a new SftpInfo from a URL string. + * + * @param s The URL string to parse + * @param c The character encoding (not currently used) + * @throws CrawlingAccessException If the URL is invalid or malformed + */ + public SftpInfo(final String s, final String c) { + if (StringUtil.isBlank(s)) { + throw new CrawlingAccessException("uri is blank."); + } + + try { + uri = new URI(normalize(s).replace(" ", "%20")); + } catch (final URISyntaxException e) { + throw new CrawlingAccessException("Invalid URL: " + s, e); + } + + if (!"sftp".equals(uri.getScheme())) { + throw new CrawlingAccessException("Invalid scheme: " + uri.getScheme()); + } + + path = uri.getPath(); + if (path == null || path.isEmpty()) { + path = "/"; + } + } + + /** + * Normalizes the URL string. 
+ * + * @param s The URL string to normalize + * @return The normalized URL string + */ + protected String normalize(final String s) { + if (s == null) { + return null; + } + String url = s.replaceAll("/+", "/").replace("sftp:/", "sftp://"); + while (url.indexOf("/../") != -1) { + url = url.replaceFirst("/[^/]+/\\.\\./", "/"); + } + return url; + } + + /** + * Gets the host name from the SFTP URL. + * + * @return The host name + */ + public String getHost() { + return uri.getHost(); + } + + /** + * Gets the port number from the SFTP URL. + * + * @return The port number + */ + public int getPort() { + int port = uri.getPort(); + if (port == -1) { + port = DEFAULT_SFTP_PORT; + } + return port; + } + + /** + * Gets the path from the SFTP URL. + * + * @return The path + */ + public String getPath() { + return path; + } + + /** + * Gets the filename from the path. + * + * @return The filename + */ + public String getFilename() { + if (path == null || path.isEmpty() || "/".equals(path)) { + return ""; + } + final int index = path.lastIndexOf('/'); + if (index >= 0 && index < path.length() - 1) { + return path.substring(index + 1); + } + return path; + } + + /** + * Constructs a complete SFTP URL. + * + * @return The complete SFTP URL + */ + public String toUrl() { + final StringBuilder buf = new StringBuilder(100); + buf.append("sftp://"); + buf.append(getHost()); + final int port = getPort(); + if (port != DEFAULT_SFTP_PORT) { + buf.append(':').append(port); + } + buf.append(path); + return normalize(buf.toString()); + } + + /** + * Constructs a child URL by appending the specified child path. + * + * @param child The child path to append + * @return The complete child URL + */ + public String toChildUrl(final String child) { + final String url = toUrl(); + if (url.endsWith("/")) { + return normalize(url + child); + } + return normalize(url + "/" + child); + } + } + + /** + * Gets the connection timeout in milliseconds. 
+ * + * @return The connection timeout + */ + public int getConnectTimeout() { + return connectTimeout; + } + + /** + * Sets the connection timeout in milliseconds. + * + * @param connectTimeout The connection timeout + */ + public void setConnectTimeout(final int connectTimeout) { + this.connectTimeout = connectTimeout; + } + + /** + * Gets the strict host key checking setting. + * + * @return The strict host key checking setting + */ + public String getStrictHostKeyChecking() { + return strictHostKeyChecking; + } + + /** + * Sets the strict host key checking setting. + * + * @param strictHostKeyChecking The strict host key checking setting + */ + public void setStrictHostKeyChecking(final String strictHostKeyChecking) { + this.strictHostKeyChecking = strictHostKeyChecking; + } + + /** + * Sets the SFTP authentication holder. + * + * @param sftpAuthenticationHolder The SFTP authentication holder to set + */ + public void setSftpAuthenticationHolder(final SftpAuthenticationHolder sftpAuthenticationHolder) { + this.sftpAuthenticationHolder = sftpAuthenticationHolder; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/webdav/WebDavAuthentication.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/webdav/WebDavAuthentication.java new file mode 100644 index 00000000..f21766e1 --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/webdav/WebDavAuthentication.java @@ -0,0 +1,115 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.webdav; + +import java.util.regex.Pattern; + +import org.codelibs.core.lang.StringUtil; + +/** + * Authentication information for WebDAV connections. + * This class holds credentials for WebDAV authentication. + * + * @author shinsuke + */ +public class WebDavAuthentication { + + /** The server URL pattern for matching. */ + protected Pattern serverPattern; + + /** The username for authentication. */ + protected String username; + + /** The password for authentication. */ + protected String password; + + /** + * Creates a new WebDavAuthentication instance. + */ + public WebDavAuthentication() { + // Default constructor + } + + /** + * Gets the server pattern. + * + * @return The server pattern. + */ + public Pattern getServerPattern() { + return serverPattern; + } + + /** + * Sets the server pattern. + * + * @param serverPattern The server pattern to set. + */ + public void setServerPattern(final Pattern serverPattern) { + this.serverPattern = serverPattern; + } + + /** + * Sets the server pattern from a string. + * + * @param serverPattern The server pattern string. + */ + public void setServer(final String serverPattern) { + if (StringUtil.isNotBlank(serverPattern)) { + this.serverPattern = Pattern.compile(serverPattern); + } + } + + /** + * Gets the username. + * + * @return The username. + */ + public String getUsername() { + return username; + } + + /** + * Sets the username. + * + * @param username The username to set. + */ + public void setUsername(final String username) { + this.username = username; + } + + /** + * Gets the password. + * + * @return The password. + */ + public String getPassword() { + return password; + } + + /** + * Sets the password. + * + * @param password The password to set. 
+ */ + public void setPassword(final String password) { + this.password = password; + } + + @Override + public String toString() { + return "WebDavAuthentication [serverPattern=" + serverPattern + ", username=" + username + "]"; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/webdav/WebDavAuthenticationHolder.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/webdav/WebDavAuthenticationHolder.java new file mode 100644 index 00000000..a4e52023 --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/webdav/WebDavAuthenticationHolder.java @@ -0,0 +1,72 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.webdav; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Holder for WebDAV authentication information. + * This class manages multiple WebDAV authentication credentials and matches them to server URLs. + * + * @author shinsuke + */ +public class WebDavAuthenticationHolder { + + /** List of WebDAV authentications. */ + protected List webDavAuthenticationList = new ArrayList<>(); + + /** + * Creates a new WebDavAuthenticationHolder instance. + */ + public WebDavAuthenticationHolder() { + // Default constructor + } + + /** + * Adds a WebDAV authentication to the holder. 
+ * + * @param webDavAuthentication The WebDAV authentication to add. + */ + public void add(final WebDavAuthentication webDavAuthentication) { + webDavAuthenticationList.add(webDavAuthentication); + } + + /** + * Gets the WebDAV authentication that matches the given URL. + * + * @param url The URL to match. + * @return The matching WebDAV authentication, or null if no match is found. + */ + public WebDavAuthentication get(final String url) { + if (url == null) { + return null; + } + + for (final WebDavAuthentication webDavAuthentication : webDavAuthenticationList) { + final Pattern pattern = webDavAuthentication.getServerPattern(); + if (pattern != null) { + final Matcher matcher = pattern.matcher(url); + if (matcher.matches()) { + return webDavAuthentication; + } + } + } + return null; + } +} diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/webdav/WebDavClient.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/webdav/WebDavClient.java new file mode 100644 index 00000000..be468542 --- /dev/null +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/client/webdav/WebDavClient.java @@ -0,0 +1,403 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +package org.codelibs.fess.crawler.client.webdav; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.codelibs.core.io.CloseableUtil; +import org.codelibs.core.io.CopyUtil; +import org.codelibs.core.io.FileUtil; +import org.codelibs.core.io.InputStreamUtil; +import org.codelibs.core.lang.StringUtil; +import org.codelibs.core.timer.TimeoutManager; +import org.codelibs.core.timer.TimeoutTask; +import org.codelibs.fess.crawler.Constants; +import org.codelibs.fess.crawler.builder.RequestDataBuilder; +import org.codelibs.fess.crawler.client.AbstractCrawlerClient; +import org.codelibs.fess.crawler.client.AccessTimeoutTarget; +import org.codelibs.fess.crawler.entity.RequestData; +import org.codelibs.fess.crawler.entity.ResponseData; +import org.codelibs.fess.crawler.exception.ChildUrlsException; +import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.helper.ContentLengthHelper; +import org.codelibs.fess.crawler.helper.MimeTypeHelper; + +import com.github.sardine.DavResource; +import com.github.sardine.Sardine; +import com.github.sardine.SardineFactory; + +import jakarta.annotation.Resource; + +/** + * WebDavClient is a crawler client implementation for accessing resources via the WebDAV protocol. + * It extends {@link AbstractCrawlerClient} and provides methods to retrieve content and metadata + * from WebDAV servers. The client supports various configurations, including authentication and timeouts. + * + *

+ * <p>
+ * The class uses Sardine library for WebDAV communication.
+ * </p>
+ *
+ * <p>
+ * The client can be configured with WebDAV-specific settings via init parameters, such as:
+ * </p>
+ * <ul>
+ * <li>charset: The character encoding for file operations.</li>
+ * <li>webDavAuthentications: An array of {@link WebDavAuthentication} objects for different WebDAV URLs.</li>
+ * </ul>
+ * + * @author shinsuke + */ +public class WebDavClient extends AbstractCrawlerClient { + + /** Logger instance for this class */ + private static final Logger logger = LogManager.getLogger(WebDavClient.class); + + /** Property name for WebDAV authentications */ + public static final String WEBDAV_AUTHENTICATIONS_PROPERTY = "webDavAuthentications"; + + /** Character encoding for WebDAV operations */ + protected String charset = Constants.UTF_8; + + /** Helper for managing content length limits */ + @Resource + protected ContentLengthHelper contentLengthHelper; + + /** The WebDAV authentication holder */ + protected volatile WebDavAuthenticationHolder webDavAuthenticationHolder; + + /** + * Creates a new WebDavClient instance. + */ + public WebDavClient() { + // Default constructor + } + + @Override + public synchronized void init() { + if (webDavAuthenticationHolder != null) { + return; + } + + if (logger.isDebugEnabled()) { + logger.debug("Initializing WebDavClient..."); + } + + super.init(); + + // Initialize WebDAV authentication holder + final WebDavAuthenticationHolder holder = new WebDavAuthenticationHolder(); + final WebDavAuthentication[] webDavAuthentications = + getInitParameter(WEBDAV_AUTHENTICATIONS_PROPERTY, new WebDavAuthentication[0], WebDavAuthentication[].class); + if (webDavAuthentications != null) { + for (final WebDavAuthentication webDavAuthentication : webDavAuthentications) { + if (logger.isDebugEnabled()) { + logger.debug("Adding WebDavAuthentication: {}", webDavAuthentication); + } + holder.add(webDavAuthentication); + } + } + webDavAuthenticationHolder = holder; + + if (logger.isInfoEnabled()) { + logger.info("WebDAV client initialized successfully"); + } + } + + @Override + public void close() { + if (webDavAuthenticationHolder == null) { + return; + } + if (logger.isDebugEnabled()) { + logger.debug("Closing WebDavClient..."); + } + webDavAuthenticationHolder = null; + } + + @Override + public ResponseData doGet(final String uri) { + 
return processRequest(uri, true); + } + + @Override + public ResponseData doHead(final String url) { + try { + final ResponseData responseData = processRequest(url, false); + responseData.setMethod(Constants.HEAD_METHOD); + return responseData; + } catch (final ChildUrlsException e) { + return null; + } + } + + /** + * Processes a WebDAV request to retrieve data from the specified URI. + * + * @param uri The URI to retrieve data from + * @param includeContent Whether to include the actual content in the response + * @return The response data containing the retrieved information + * @throws CrawlingAccessException If the WebDAV request fails + */ + protected ResponseData processRequest(final String uri, final boolean includeContent) { + if (webDavAuthenticationHolder == null) { + init(); + } + + // start + AccessTimeoutTarget accessTimeoutTarget = null; + TimeoutTask accessTimeoutTask = null; + if (accessTimeout != null) { + accessTimeoutTarget = new AccessTimeoutTarget(Thread.currentThread()); + accessTimeoutTask = TimeoutManager.getInstance().addTimeoutTarget(accessTimeoutTarget, accessTimeout, false); + } + + try { + return getResponseData(uri, includeContent); + } finally { + if (accessTimeoutTarget != null) { + accessTimeoutTarget.stop(); + if (accessTimeoutTask != null && !accessTimeoutTask.isCanceled()) { + accessTimeoutTask.cancel(); + } + } + } + } + + /** + * Retrieves response data from the WebDAV server for the specified URI. 
+ * + * @param uri The URI to retrieve data from + * @param includeContent Whether to include the actual content in the response + * @return The response data containing the retrieved information + * @throws CrawlingAccessException If the WebDAV operation fails + */ + protected ResponseData getResponseData(final String uri, final boolean includeContent) { + if (logger.isDebugEnabled()) { + logger.debug("Accessing WebDAV resource: uri={}, includeContent={}", uri, includeContent); + } + + final ResponseData responseData = new ResponseData(); + try { + responseData.setMethod(includeContent ? Constants.GET_METHOD : Constants.HEAD_METHOD); + final String normalizedUri = normalizeUri(uri); + responseData.setUrl(normalizedUri); + + final Sardine sardine = createSardine(normalizedUri); + + if (!sardine.exists(normalizedUri)) { + responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(0); + return responseData; + } + + final List resources = sardine.list(normalizedUri, 0); + if (resources == null || resources.isEmpty()) { + responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(0); + return responseData; + } + + final DavResource resource = resources.get(0); + + if (resource.isDirectory()) { + if (logger.isDebugEnabled()) { + logger.debug("Processing WebDAV directory: {}", normalizedUri); + } + final Set requestDataSet = new HashSet<>(); + if (includeContent) { + final List children = sardine.list(normalizedUri, 1); + if (logger.isDebugEnabled()) { + logger.debug("Found {} entries in directory: {}", children.size(), normalizedUri); + } + for (int i = 1; i < children.size(); i++) { + final DavResource child = children.get(i); + final String childUri = child.getHref().toString(); + requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build()); + } + } + throw new ChildUrlsException(requestDataSet, 
this.getClass().getName() + "#getResponseData"); + } + + // File resource + responseData.setHttpStatusCode(Constants.OK_STATUS_CODE); + responseData.setCharSet(charset); + responseData.setContentLength(resource.getContentLength()); + checkMaxContentLength(responseData); + + if (resource.getModified() != null) { + responseData.setLastModified(resource.getModified()); + } else if (resource.getCreation() != null) { + responseData.setLastModified(resource.getCreation()); + } else { + responseData.setLastModified(new Date()); + } + + if (StringUtil.isNotBlank(resource.getContentType())) { + responseData.setMimeType(resource.getContentType()); + } + + if (contentLengthHelper != null) { + final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType()); + if (responseData.getContentLength() > maxLength) { + throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + + maxLength + " byte. The url is " + normalizedUri); + } + } + + if (includeContent) { + if (resource.getContentLength() < maxCachedContentSize) { + try (InputStream contentStream = new BufferedInputStream(sardine.get(normalizedUri))) { + responseData.setResponseBody(InputStreamUtil.getBytes(contentStream)); + } catch (final Exception e) { + logger.warn("Failed to read WebDAV content: uri={}, size={}", normalizedUri, resource.getContentLength(), e); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + } + } else { + File outputFile = null; + try { + outputFile = createTempFile("crawler-WebDavClient-", ".out", null); + try (InputStream in = sardine.get(normalizedUri)) { + CopyUtil.copy(in, outputFile); + } + responseData.setResponseBody(outputFile, true); + if (logger.isDebugEnabled()) { + logger.debug( + "File size exceeds cache threshold, using temp file: uri={}, size={}, threshold={}, tempFile={}", + normalizedUri, resource.getContentLength(), maxCachedContentSize, outputFile.getAbsolutePath()); + } + } catch (final 
Exception e) { + logger.warn("Failed to write WebDAV content to temp file: uri={}, size={}, tempFile={}", normalizedUri, + resource.getContentLength(), outputFile != null ? outputFile.getAbsolutePath() : "null", e); + responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE); + FileUtil.deleteInBackground(outputFile); + } + } + + if (StringUtil.isBlank(responseData.getMimeType())) { + final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper"); + try (final InputStream is = responseData.getResponseBody()) { + responseData.setMimeType(mimeTypeHelper.getContentType(is, getFileName(normalizedUri))); + } catch (final Exception e) { + responseData.setMimeType(mimeTypeHelper.getContentType(null, getFileName(normalizedUri))); + } + } + } + + } catch (final CrawlerSystemException e) { + CloseableUtil.closeQuietly(responseData); + throw e; + } catch (final IOException e) { + CloseableUtil.closeQuietly(responseData); + throw new CrawlingAccessException("Could not access " + uri, e); + } catch (final Exception e) { + CloseableUtil.closeQuietly(responseData); + throw new CrawlingAccessException("Could not access " + uri, e); + } + + return responseData; + } + + /** + * Creates a Sardine client for the given URI. + * + * @param uri The URI to create a client for + * @return A Sardine client + */ + protected Sardine createSardine(final String uri) { + final WebDavAuthentication auth = webDavAuthenticationHolder.get(uri); + if (auth != null && StringUtil.isNotBlank(auth.getUsername())) { + if (logger.isDebugEnabled()) { + logger.debug("Creating authenticated Sardine client for: {}", uri); + } + return SardineFactory.begin(auth.getUsername(), auth.getPassword()); + } + if (logger.isDebugEnabled()) { + logger.debug("Creating anonymous Sardine client for: {}", uri); + } + return SardineFactory.begin(); + } + + /** + * Normalizes the URI. 
+ * + * @param uri The URI to normalize + * @return The normalized URI + * @throws CrawlerSystemException If the URI is empty + */ + protected String normalizeUri(final String uri) { + if (StringUtil.isEmpty(uri)) { + throw new CrawlerSystemException("The uri is empty."); + } + return uri; + } + + /** + * Extracts the file name from the URI. + * + * @param uri The URI + * @return The file name + */ + protected String getFileName(final String uri) { + if (uri == null || uri.endsWith("/")) { + return ""; + } + final int index = uri.lastIndexOf('/'); + if (index >= 0 && index < uri.length() - 1) { + return uri.substring(index + 1); + } + return uri; + } + + /** + * Gets the character encoding used for WebDAV operations. + * + * @return The character encoding + */ + public String getCharset() { + return charset; + } + + /** + * Sets the character encoding used for WebDAV operations. + * + * @param charset The character encoding to set + */ + public void setCharset(final String charset) { + this.charset = charset; + } + + /** + * Sets the WebDAV authentication holder. + * + * @param webDavAuthenticationHolder The WebDAV authentication holder to set + */ + public void setWebDavAuthenticationHolder(final WebDavAuthenticationHolder webDavAuthenticationHolder) { + this.webDavAuthenticationHolder = webDavAuthenticationHolder; + } +} diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/aws/AwsS3ClientTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/aws/AwsS3ClientTest.java new file mode 100644 index 00000000..b07a84ff --- /dev/null +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/aws/AwsS3ClientTest.java @@ -0,0 +1,78 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.aws; + +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.dbflute.utflute.core.PlainTestCase; + +/** + * Test class for AwsS3Client. + * + * @author shinsuke + */ +public class AwsS3ClientTest extends PlainTestCase { + + public AwsS3Client awsS3Client; + + @Override + protected void setUp() throws Exception { + super.setUp(); + StandardCrawlerContainer container = new StandardCrawlerContainer(); + container.singleton("mimeTypeHelper", org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl.class); + awsS3Client = new AwsS3Client(); + awsS3Client.crawlerContainer = container; + } + + @Override + protected void tearDown() throws Exception { + awsS3Client.close(); + super.tearDown(); + } + + public void test_parsePath() { + String[] result = awsS3Client.parsePath("my-bucket/path/to/object.txt"); + assertEquals("my-bucket", result[0]); + assertEquals("path/to/object.txt", result[1]); + + result = awsS3Client.parsePath("my-bucket/object.txt"); + assertEquals("my-bucket", result[0]); + assertEquals("object.txt", result[1]); + + result = awsS3Client.parsePath("my-bucket"); + assertEquals("my-bucket", result[0]); + assertEquals("", result[1]); + } + + public void test_parsePath_invalid() { + try { + awsS3Client.parsePath(""); + fail("Should throw CrawlingAccessException for empty path"); + } catch (final CrawlingAccessException e) { + assertTrue(e.getMessage().contains("Invalid path")); + } + } + + public void 
test_normalizeUri() { + assertEquals("s3://my-bucket/object.txt", awsS3Client.normalizeUri("s3://my-bucket/object.txt")); + assertEquals("s3://my-bucket/object.txt", awsS3Client.normalizeUri("my-bucket/object.txt")); + } + + public void test_charsetGetterSetter() { + awsS3Client.setCharset("UTF-16"); + assertEquals("UTF-16", awsS3Client.getCharset()); + } +} diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/azure/AzureBlobClientTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/azure/AzureBlobClientTest.java new file mode 100644 index 00000000..0e9606eb --- /dev/null +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/azure/AzureBlobClientTest.java @@ -0,0 +1,78 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.azure; + +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.dbflute.utflute.core.PlainTestCase; + +/** + * Test class for AzureBlobClient. 
+ * + * @author shinsuke + */ +public class AzureBlobClientTest extends PlainTestCase { + + public AzureBlobClient azureBlobClient; + + @Override + protected void setUp() throws Exception { + super.setUp(); + StandardCrawlerContainer container = new StandardCrawlerContainer(); + container.singleton("mimeTypeHelper", org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl.class); + azureBlobClient = new AzureBlobClient(); + azureBlobClient.crawlerContainer = container; + } + + @Override + protected void tearDown() throws Exception { + azureBlobClient.close(); + super.tearDown(); + } + + public void test_parsePath() { + String[] result = azureBlobClient.parsePath("my-container/path/to/blob.txt"); + assertEquals("my-container", result[0]); + assertEquals("path/to/blob.txt", result[1]); + + result = azureBlobClient.parsePath("my-container/blob.txt"); + assertEquals("my-container", result[0]); + assertEquals("blob.txt", result[1]); + + result = azureBlobClient.parsePath("my-container"); + assertEquals("my-container", result[0]); + assertEquals("", result[1]); + } + + public void test_parsePath_invalid() { + try { + azureBlobClient.parsePath(""); + fail("Should throw CrawlingAccessException for empty path"); + } catch (final CrawlingAccessException e) { + assertTrue(e.getMessage().contains("Invalid path")); + } + } + + public void test_normalizeUri() { + assertEquals("azure://my-container/blob.txt", azureBlobClient.normalizeUri("azure://my-container/blob.txt")); + assertEquals("azure://my-container/blob.txt", azureBlobClient.normalizeUri("my-container/blob.txt")); + } + + public void test_charsetGetterSetter() { + azureBlobClient.setCharset("UTF-16"); + assertEquals("UTF-16", azureBlobClient.getCharset()); + } +} diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/gcp/GoogleCloudStorageClientTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/gcp/GoogleCloudStorageClientTest.java new file mode 100644 index 00000000..0251943f --- 
/dev/null +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/gcp/GoogleCloudStorageClientTest.java @@ -0,0 +1,78 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.gcp; + +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.dbflute.utflute.core.PlainTestCase; + +/** + * Test class for GoogleCloudStorageClient. 
+ * + * @author shinsuke + */ +public class GoogleCloudStorageClientTest extends PlainTestCase { + + public GoogleCloudStorageClient gcsClient; + + @Override + protected void setUp() throws Exception { + super.setUp(); + StandardCrawlerContainer container = new StandardCrawlerContainer(); + container.singleton("mimeTypeHelper", org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl.class); + gcsClient = new GoogleCloudStorageClient(); + gcsClient.crawlerContainer = container; + } + + @Override + protected void tearDown() throws Exception { + gcsClient.close(); + super.tearDown(); + } + + public void test_parsePath() { + String[] result = gcsClient.parsePath("my-bucket/path/to/object.txt"); + assertEquals("my-bucket", result[0]); + assertEquals("path/to/object.txt", result[1]); + + result = gcsClient.parsePath("my-bucket/object.txt"); + assertEquals("my-bucket", result[0]); + assertEquals("object.txt", result[1]); + + result = gcsClient.parsePath("my-bucket"); + assertEquals("my-bucket", result[0]); + assertEquals("", result[1]); + } + + public void test_parsePath_invalid() { + try { + gcsClient.parsePath(""); + fail("Should throw CrawlingAccessException for empty path"); + } catch (final CrawlingAccessException e) { + assertTrue(e.getMessage().contains("Invalid path")); + } + } + + public void test_normalizeUri() { + assertEquals("gs://my-bucket/object.txt", gcsClient.normalizeUri("gs://my-bucket/object.txt")); + assertEquals("gs://my-bucket/object.txt", gcsClient.normalizeUri("my-bucket/object.txt")); + } + + public void test_charsetGetterSetter() { + gcsClient.setCharset("UTF-16"); + assertEquals("UTF-16", gcsClient.getCharset()); + } +} diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/git/GitClientTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/git/GitClientTest.java new file mode 100644 index 00000000..a92f9a23 --- /dev/null +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/git/GitClientTest.java 
@@ -0,0 +1,144 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.git; + +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.dbflute.utflute.core.PlainTestCase; + +/** + * Test class for GitClient. + * + * @author shinsuke + */ +public class GitClientTest extends PlainTestCase { + + public GitClient gitClient; + + @Override + protected void setUp() throws Exception { + super.setUp(); + StandardCrawlerContainer container = new StandardCrawlerContainer(); + container.singleton("mimeTypeHelper", org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl.class); + gitClient = new GitClient(); + gitClient.crawlerContainer = container; + } + + @Override + protected void tearDown() throws Exception { + gitClient.close(); + super.tearDown(); + } + + public void test_init() { + gitClient.init(); + assertNotNull(gitClient); + assertNotNull(gitClient.getLocalRepoDir()); + } + + public void test_GitInfo() { + GitClient.GitInfo info = new GitClient.GitInfo("https://github.com/user/repo.git", "master", "src/Main.java"); + assertEquals("https://github.com/user/repo.git", info.getRepositoryUrl()); + assertEquals("master", info.getBranch()); + assertEquals("src/Main.java", info.getPath()); + assertEquals("Main.java", info.getFilename()); + 
assertNotNull(info.getRepositoryName()); + } + + public void test_parseGitUri() { + gitClient.init(); + + GitClient.GitInfo info = gitClient.parseGitUri("git://https://github.com/user/repo.git/master/src/Main.java"); + assertEquals("https://github.com/user/repo.git", info.getRepositoryUrl()); + assertEquals("master", info.getBranch()); + assertEquals("src/Main.java", info.getPath()); + + info = gitClient.parseGitUri("git://https://github.com/user/repo.git/develop"); + assertEquals("https://github.com/user/repo.git", info.getRepositoryUrl()); + assertEquals("develop", info.getBranch()); + assertEquals("", info.getPath()); + } + + public void test_parseGitUri_invalid() { + gitClient.init(); + + try { + gitClient.parseGitUri("http://example.com/file.txt"); + fail("Should throw CrawlingAccessException for invalid scheme"); + } catch (final CrawlingAccessException e) { + assertTrue(e.getMessage().contains("Invalid Git URI")); + } + + try { + gitClient.parseGitUri("git://invalid"); + fail("Should throw CrawlingAccessException for invalid format"); + } catch (final CrawlingAccessException e) { + assertTrue(e.getMessage().contains("Invalid Git URI format")); + } + } + + public void test_GitAuthentication() { + final GitAuthentication auth = new GitAuthentication(); + auth.setServer("https://github\\.com/.*"); + auth.setUsername("testuser"); + auth.setPassword("testpass"); + + assertEquals("testuser", auth.getUsername()); + assertEquals("testpass", auth.getPassword()); + assertNotNull(auth.getServerPattern()); + } + + public void test_GitAuthenticationHolder() { + final GitAuthenticationHolder holder = new GitAuthenticationHolder(); + + final GitAuthentication auth1 = new GitAuthentication(); + auth1.setServer("https://github\\.com/.*"); + auth1.setUsername("user1"); + + final GitAuthentication auth2 = new GitAuthentication(); + auth2.setServer("https://gitlab\\.com/.*"); + auth2.setUsername("user2"); + + holder.add(auth1); + holder.add(auth2); + + GitAuthentication found = 
holder.get("https://github.com/user/repo.git"); + assertNotNull(found); + assertEquals("user1", found.getUsername()); + + found = holder.get("https://gitlab.com/user/repo.git"); + assertNotNull(found); + assertEquals("user2", found.getUsername()); + + found = holder.get("https://unknown.com/user/repo.git"); + assertNull(found); + } + + public void test_charsetGetterSetter() { + gitClient.setCharset("UTF-16"); + assertEquals("UTF-16", gitClient.getCharset()); + } + + public void test_getMimeType() { + gitClient.init(); + + assertEquals("text/plain", gitClient.getMimeType("file.txt")); + assertEquals("text/x-java-source", gitClient.getMimeType("Main.java")); + assertEquals("application/xml", gitClient.getMimeType("config.xml")); + assertEquals("application/json", gitClient.getMimeType("data.json")); + assertEquals("application/octet-stream", gitClient.getMimeType("file.bin")); + } +} diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/sftp/SftpClientTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/sftp/SftpClientTest.java new file mode 100644 index 00000000..d086f0b4 --- /dev/null +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/sftp/SftpClientTest.java @@ -0,0 +1,157 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +package org.codelibs.fess.crawler.client.sftp; + +import java.util.HashMap; +import java.util.Map; + +import org.codelibs.fess.crawler.Constants; +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.dbflute.utflute.core.PlainTestCase; + +/** + * Test class for SftpClient. + * + * @author shinsuke + */ +public class SftpClientTest extends PlainTestCase { + + public SftpClient sftpClient; + + @Override + protected void setUp() throws Exception { + super.setUp(); + StandardCrawlerContainer container = new StandardCrawlerContainer(); + container.singleton("mimeTypeHelper", org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl.class); + sftpClient = new SftpClient(); + sftpClient.crawlerContainer = container; + } + + @Override + protected void tearDown() throws Exception { + sftpClient.close(); + super.tearDown(); + } + + public void test_init() { + final Map params = new HashMap<>(); + params.put("connectTimeout", 5000); + params.put("strictHostKeyChecking", "yes"); + + sftpClient.setInitParameterMap(params); + sftpClient.init(); + + assertEquals(5000, sftpClient.getConnectTimeout()); + assertEquals("yes", sftpClient.getStrictHostKeyChecking()); + } + + public void test_SftpInfo() { + // Test basic URL parsing + SftpClient.SftpInfo info = new SftpClient.SftpInfo("sftp://example.com/path/to/file.txt", Constants.UTF_8); + assertEquals("example.com", info.getHost()); + assertEquals(22, info.getPort()); + assertEquals("/path/to/file.txt", info.getPath()); + assertEquals("file.txt", info.getFilename()); + + // Test with custom port + info = new SftpClient.SftpInfo("sftp://example.com:2222/path/to/file.txt", Constants.UTF_8); + assertEquals("example.com", info.getHost()); + assertEquals(2222, info.getPort()); + + // Test root directory + info = new SftpClient.SftpInfo("sftp://example.com/", Constants.UTF_8); + assertEquals("/", info.getPath()); + assertEquals("", 
info.getFilename()); + + // Test child URL generation + info = new SftpClient.SftpInfo("sftp://example.com/path/to", Constants.UTF_8); + String childUrl = info.toChildUrl("file.txt"); + assertTrue(childUrl.contains("file.txt")); + } + + public void test_SftpInfo_invalidScheme() { + try { + new SftpClient.SftpInfo("http://example.com/file.txt", Constants.UTF_8); + fail("Should throw CrawlingAccessException for invalid scheme"); + } catch (final CrawlingAccessException e) { + assertTrue(e.getMessage().contains("Invalid scheme")); + } + } + + public void test_SftpInfo_blankUrl() { + try { + new SftpClient.SftpInfo("", Constants.UTF_8); + fail("Should throw CrawlingAccessException for blank URL"); + } catch (final CrawlingAccessException e) { + assertTrue(e.getMessage().contains("blank")); + } + } + + public void test_SftpAuthentication() { + final SftpAuthentication auth = new SftpAuthentication(); + auth.setServer("sftp://example\\.com/.*"); + auth.setPort(2222); + auth.setUsername("testuser"); + auth.setPassword("testpass"); + + assertEquals(2222, auth.getPort()); + assertEquals("testuser", auth.getUsername()); + assertEquals("testpass", auth.getPassword()); + assertNotNull(auth.getServerPattern()); + } + + public void test_SftpAuthenticationHolder() { + final SftpAuthenticationHolder holder = new SftpAuthenticationHolder(); + + final SftpAuthentication auth1 = new SftpAuthentication(); + auth1.setServer("sftp://example\\.com/.*"); + auth1.setUsername("user1"); + + final SftpAuthentication auth2 = new SftpAuthentication(); + auth2.setServer("sftp://test\\.com/.*"); + auth2.setUsername("user2"); + + holder.add(auth1); + holder.add(auth2); + + SftpAuthentication found = holder.get("sftp://example.com/path/file.txt"); + assertNotNull(found); + assertEquals("user1", found.getUsername()); + + found = holder.get("sftp://test.com/path/file.txt"); + assertNotNull(found); + assertEquals("user2", found.getUsername()); + + found = 
holder.get("sftp://unknown.com/path/file.txt"); + assertNull(found); + } + + public void test_charsetGetterSetter() { + sftpClient.setCharset("UTF-16"); + assertEquals("UTF-16", sftpClient.getCharset()); + } + + public void test_connectTimeoutGetterSetter() { + sftpClient.setConnectTimeout(15000); + assertEquals(15000, sftpClient.getConnectTimeout()); + } + + public void test_strictHostKeyCheckingGetterSetter() { + sftpClient.setStrictHostKeyChecking("yes"); + assertEquals("yes", sftpClient.getStrictHostKeyChecking()); + } +} diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/webdav/WebDavClientTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/webdav/WebDavClientTest.java new file mode 100644 index 00000000..5284bdc0 --- /dev/null +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/client/webdav/WebDavClientTest.java @@ -0,0 +1,104 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.client.webdav; + +import java.util.HashMap; +import java.util.Map; + +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.dbflute.utflute.core.PlainTestCase; + +/** + * Test class for WebDavClient. 
+ * + * @author shinsuke + */ +public class WebDavClientTest extends PlainTestCase { + + public WebDavClient webDavClient; + + @Override + protected void setUp() throws Exception { + super.setUp(); + StandardCrawlerContainer container = new StandardCrawlerContainer(); + container.singleton("mimeTypeHelper", org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl.class); + webDavClient = new WebDavClient(); + webDavClient.crawlerContainer = container; + } + + @Override + protected void tearDown() throws Exception { + webDavClient.close(); + super.tearDown(); + } + + public void test_init() { + final Map params = new HashMap<>(); + webDavClient.setInitParameterMap(params); + webDavClient.init(); + + assertNotNull(webDavClient); + } + + public void test_WebDavAuthentication() { + final WebDavAuthentication auth = new WebDavAuthentication(); + auth.setServer("http://example\\.com/webdav/.*"); + auth.setUsername("testuser"); + auth.setPassword("testpass"); + + assertEquals("testuser", auth.getUsername()); + assertEquals("testpass", auth.getPassword()); + assertNotNull(auth.getServerPattern()); + } + + public void test_WebDavAuthenticationHolder() { + final WebDavAuthenticationHolder holder = new WebDavAuthenticationHolder(); + + final WebDavAuthentication auth1 = new WebDavAuthentication(); + auth1.setServer("http://example\\.com/webdav/.*"); + auth1.setUsername("user1"); + + final WebDavAuthentication auth2 = new WebDavAuthentication(); + auth2.setServer("http://test\\.com/webdav/.*"); + auth2.setUsername("user2"); + + holder.add(auth1); + holder.add(auth2); + + WebDavAuthentication found = holder.get("http://example.com/webdav/files/file.txt"); + assertNotNull(found); + assertEquals("user1", found.getUsername()); + + found = holder.get("http://test.com/webdav/files/file.txt"); + assertNotNull(found); + assertEquals("user2", found.getUsername()); + + found = holder.get("http://unknown.com/webdav/files/file.txt"); + assertNull(found); + } + + public void 
test_charsetGetterSetter() { + webDavClient.setCharset("UTF-16"); + assertEquals("UTF-16", webDavClient.getCharset()); + } + + public void test_getFileName() { + assertEquals("file.txt", webDavClient.getFileName("http://example.com/path/to/file.txt")); + assertEquals("file.txt", webDavClient.getFileName("/path/to/file.txt")); + assertEquals("", webDavClient.getFileName("http://example.com/path/to/")); + assertEquals("", webDavClient.getFileName(null)); + } +}