diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4f26eee70b88f..8ee9a676134ef 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -736,6 +736,11 @@ def index_ignore_pcms : Flag<["-"], "index-ignore-pcms">, Visibility<[ClangOption, CC1Option]>, HelpText<"Ignore symbols from imported pcm modules">, MarshallingInfoFlag>; +def index_store_record_compression + : Flag<["-"], "index-store-compress">, + Visibility<[ClangOption, CC1Option]>, + HelpText<"Whether to compress unit and record files in the index store">, + MarshallingInfoFlag>; // Make sure all other -ccc- options are rejected. def ccc_ : Joined<["-"], "ccc-">, Group, Flags<[Unsupported]>; diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h index f25d43d9ffb20..1314d4d38df01 100644 --- a/clang/include/clang/Frontend/FrontendOptions.h +++ b/clang/include/clang/Frontend/FrontendOptions.h @@ -477,6 +477,9 @@ class FrontendOptions { std::string IndexStorePath; std::string IndexUnitOutputPath; + /// Whether to compress the unit and record files in the index store. + bool IndexStoreCompress = false; + /// The input kind, either specified via -x argument or deduced from the input /// file name. InputKind DashX; diff --git a/clang/include/clang/Index/IndexRecordWriter.h b/clang/include/clang/Index/IndexRecordWriter.h index e8020aa6ad620..c4fc4684a04f9 100644 --- a/clang/include/clang/Index/IndexRecordWriter.h +++ b/clang/include/clang/Index/IndexRecordWriter.h @@ -53,10 +53,12 @@ typedef llvm::function_ref &Scratch)> /// beginRecord, and if the file does not already exist, then proceed to add /// all symbol occurrences (addOccurrence) and finally finish with endRecord. class IndexRecordWriter { + /// Whether to compress the index record using zlib. + bool Compress; SmallString<64> RecordsPath; ///< The records directory path. 
void *Record = nullptr; ///< The state of the current record. public: - IndexRecordWriter(StringRef IndexPath); + IndexRecordWriter(StringRef IndexPath, bool Compress); enum class Result { Success, diff --git a/clang/include/clang/Index/IndexUnitWriter.h b/clang/include/clang/Index/IndexUnitWriter.h index 1edc51ac2dd9d..ee7cf6cd044c3 100644 --- a/clang/include/clang/Index/IndexUnitWriter.h +++ b/clang/include/clang/Index/IndexUnitWriter.h @@ -50,6 +50,8 @@ class IndexUnitWriter { SmallString<64> UnitsPath; std::string ProviderIdentifier; std::string ProviderVersion; + /// Whether to compress the index unit using zlib. + bool Compress; std::string OutputFile; std::string ModuleName; OptionalFileEntryRef MainFile; @@ -92,17 +94,12 @@ class IndexUnitWriter { /// \param IsSystem true for system module units, false otherwise. /// \param Remapper Remapper to use to standardize file paths to make them /// hermetic/reproducible. This applies to all paths emitted in the unit file. - IndexUnitWriter(FileManager &FileMgr, - StringRef StorePath, + IndexUnitWriter(FileManager &FileMgr, StringRef StorePath, StringRef ProviderIdentifier, StringRef ProviderVersion, - StringRef OutputFile, - StringRef ModuleName, - OptionalFileEntryRef MainFile, - bool IsSystem, - bool IsModuleUnit, - bool IsDebugCompilation, - StringRef TargetTriple, - StringRef SysrootPath, + bool Compress, StringRef OutputFile, StringRef ModuleName, + OptionalFileEntryRef MainFile, bool IsSystem, + bool IsModuleUnit, bool IsDebugCompilation, + StringRef TargetTriple, StringRef SysrootPath, const PathRemapper &Remapper, writer::ModuleInfoWriterCallback GetInfoForModule); ~IndexUnitWriter(); diff --git a/clang/lib/Index/ClangIndexRecordWriter.cpp b/clang/lib/Index/ClangIndexRecordWriter.cpp index 0c3793bfb2284..24e574614b06e 100644 --- a/clang/lib/Index/ClangIndexRecordWriter.cpp +++ b/clang/lib/Index/ClangIndexRecordWriter.cpp @@ -61,9 +61,9 @@ StringRef ClangIndexRecordWriter::getUSRNonCached(const 
IdentifierInfo *Name, return StringRef(Ptr, USR.size()); } -ClangIndexRecordWriter::ClangIndexRecordWriter(ASTContext &Ctx, +ClangIndexRecordWriter::ClangIndexRecordWriter(ASTContext &Ctx, bool Compress, RecordingOptions Opts) - : Impl(Opts.DataDirPath), Ctx(Ctx), RecordOpts(std::move(Opts)) { + : Impl(Opts.DataDirPath, Compress), Ctx(Ctx), RecordOpts(std::move(Opts)) { if (Opts.RecordSymbolCodeGenName) ASTNameGen.reset(new ASTNameGenerator(Ctx)); } diff --git a/clang/lib/Index/ClangIndexRecordWriter.h b/clang/lib/Index/ClangIndexRecordWriter.h index eb8c66e1f2936..00b84bb245abb 100644 --- a/clang/lib/Index/ClangIndexRecordWriter.h +++ b/clang/lib/Index/ClangIndexRecordWriter.h @@ -35,7 +35,7 @@ class ClangIndexRecordWriter { llvm::DenseMap USRByDecl; public: - ClangIndexRecordWriter(ASTContext &Ctx, RecordingOptions Opts); + ClangIndexRecordWriter(ASTContext &Ctx, bool Compress, RecordingOptions Opts); ~ClangIndexRecordWriter(); ASTContext &getASTContext() { return Ctx; } diff --git a/clang/lib/Index/IndexRecordReader.cpp b/clang/lib/Index/IndexRecordReader.cpp index f8078becdded5..84e375bcee9c3 100644 --- a/clang/lib/Index/IndexRecordReader.cpp +++ b/clang/lib/Index/IndexRecordReader.cpp @@ -7,11 +7,12 @@ //===----------------------------------------------------------------------===// #include "clang/Index/IndexRecordReader.h" -#include "IndexDataStoreUtils.h" #include "BitstreamVisitor.h" +#include "IndexDataStoreUtils.h" #include "clang/Index/IndexDataStoreSymbolUtils.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Bitstream/BitstreamReader.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -368,7 +369,42 @@ IndexRecordReader::createWithBuffer(std::unique_ptr Buffer, std::unique_ptr Reader; Reader.reset(new IndexRecordReader()); auto &Impl = Reader->Impl; - Impl.Buffer = std::move(Buffer); + if (Buffer->getBuffer().starts_with("CIDXR")) { + if 
(!llvm::compression::zlib::isAvailable()) { + Error = "zlib not available to decompress compressed index record"; + return nullptr; + } + + ArrayRef compressedBuffer = + llvm::arrayRefFromStringRef(Buffer->getBuffer()); + + // Slice off the `CIDXR` marker we checked above. + compressedBuffer = compressedBuffer.slice(5); + + // Read the uncompressed size of the record. + if (compressedBuffer.size() < 4) { + Error = "Unexpectedly found end of record file"; + return nullptr; + } + size_t uncompressedSize = + llvm::support::endian::read32le(compressedBuffer.data()); + compressedBuffer = compressedBuffer.slice(4); + + // Decompress the record + llvm::SmallVector decompressed; + llvm::Error decompressError = llvm::compression::zlib::decompress( + compressedBuffer, decompressed, uncompressedSize); + if (decompressError) { + llvm::raw_string_ostream ErrorOS(Error); + ErrorOS << "Failed to decompress index record: " << decompressError; + return nullptr; + } + Impl.Buffer = llvm::MemoryBuffer::getMemBufferCopy( + llvm::toStringRef(decompressed), + Buffer->getBufferIdentifier() + " decompressed"); + } else { + Impl.Buffer = std::move(Buffer); + } llvm::BitstreamCursor Stream(*Impl.Buffer); if (Stream.AtEndOfStream()) { diff --git a/clang/lib/Index/IndexRecordWriter.cpp b/clang/lib/Index/IndexRecordWriter.cpp index ec1ed09b95afd..83840c6ccc3d0 100644 --- a/clang/lib/Index/IndexRecordWriter.cpp +++ b/clang/lib/Index/IndexRecordWriter.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringSet.h" #include "llvm/Bitstream/BitstreamWriter.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -234,8 +235,8 @@ static void writeDecls(BitstreamWriter &Stream, ArrayRef Decls, Stream.ExitBlock(); } -IndexRecordWriter::IndexRecordWriter(StringRef IndexPath) - : RecordsPath(IndexPath) { +IndexRecordWriter::IndexRecordWriter(StringRef IndexPath, bool Compress) + : 
Compress(Compress), RecordsPath(IndexPath) { store::appendRecordSubDir(RecordsPath); } @@ -319,7 +320,44 @@ IndexRecordWriter::endRecord(std::string &Error, } raw_fd_ostream OS(TempFD, /*shouldClose=*/true); - OS.write(State.Buffer.data(), State.Buffer.size()); + if (Compress) { + if (!llvm::compression::zlib::isAvailable()) { + Error = "Zlib not available to compress record file"; + return Result::Failure; + } + + // Higher compression levels add marginal improvements to the compressed + // size while having a measurable impact on compile time. An analysis on a + // mixed clang / Swift project showed the following results: + // - BestSpeed: Compresses the index store by 66% while increasing the + // index-while-building overhead by 15% (from 1.07% to 1.23%) + // - Default: Compression of 68.1%, increases index-while-building overhead + // by 23% + // - BestSize: Compression of 68.2%, increases index-while-building + // overhead by 37% + // Based on those numbers, BestSpeed seems like the best choice. If clients + // need to compress the index store further, they should run a compression + // algorithm across all files in the index store. + auto compressionLevel = compression::zlib::BestSpeedCompression; + ArrayRef bufferRef = llvm::arrayRefFromStringRef(State.Buffer); + llvm::SmallVector compressed; + llvm::compression::zlib::compress(bufferRef, compressed, compressionLevel); + + // Write the `CIDXR` (compressed index record) marker to indicate that this + // is a compressed record file. + OS << "CIDXR"; + + // Write the size of the uncompressed record so that we can allocate a + // buffer of the corresponding size when decompressing it. 
+ char Buf[4]; + llvm::support::endian::write32le(Buf, bufferRef.size()); + OS.write(Buf, sizeof(Buf)); + + // Write the actual compressed data + OS << llvm::toStringRef(compressed); + } else { + OS << State.Buffer; + } OS.close(); if (OS.has_error()) { diff --git a/clang/lib/Index/IndexUnitReader.cpp b/clang/lib/Index/IndexUnitReader.cpp index a455f1e4977e9..21563094f81c2 100644 --- a/clang/lib/Index/IndexUnitReader.cpp +++ b/clang/lib/Index/IndexUnitReader.cpp @@ -282,7 +282,42 @@ bool IndexUnitReaderImpl::init(std::unique_ptr Buf, sys::TimePoint<> ModTime, std::string &Error) { this->ModTime = ModTime; - this->MemBuf = std::move(Buf); + + if (Buf->getBuffer().starts_with("CIDXU")) { + if (!llvm::compression::zlib::isAvailable()) { + Error = "zlib not available to decompress compressed index unit"; + return true; + } + + ArrayRef compressedBuffer = llvm::arrayRefFromStringRef(Buf->getBuffer()); + + // Slice off the `CIDXU` marker we checked above. + compressedBuffer = compressedBuffer.slice(5); + + // Read the uncompressed size of the unit. 
+ if (compressedBuffer.size() < 4) { + Error = "Unexpectedly found end of record unit"; + return true; + } + size_t uncompressedSize = + llvm::support::endian::read32le(compressedBuffer.data()); + compressedBuffer = compressedBuffer.slice(4); + + // Decompress the unit + llvm::SmallVector decompressed; + llvm::Error decompressError = llvm::compression::zlib::decompress( + compressedBuffer, decompressed, uncompressedSize); + if (decompressError) { + llvm::raw_string_ostream ErrorOS(Error); + ErrorOS << "Failed to decompress index unit: " << decompressError; + return true; + } + this->MemBuf = llvm::MemoryBuffer::getMemBufferCopy( + llvm::toStringRef(decompressed), + Buf->getBufferIdentifier() + " decompressed"); + } else { + this->MemBuf = std::move(Buf); + } llvm::BitstreamCursor Stream(*MemBuf); if (Stream.AtEndOfStream()) { diff --git a/clang/lib/Index/IndexUnitWriter.cpp b/clang/lib/Index/IndexUnitWriter.cpp index f5a1855b26b76..c2e41ec6fb9cd 100644 --- a/clang/lib/Index/IndexUnitWriter.cpp +++ b/clang/lib/Index/IndexUnitWriter.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/Bitstream/BitstreamWriter.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -116,21 +117,14 @@ class IndexUnitWriter::PathStorage { } }; -IndexUnitWriter::IndexUnitWriter(FileManager &FileMgr, - StringRef StorePath, - StringRef ProviderIdentifier, - StringRef ProviderVersion, - StringRef OutputFile, - StringRef ModuleName, - OptionalFileEntryRef MainFile, - bool IsSystem, - bool IsModuleUnit, - bool IsDebugCompilation, - StringRef TargetTriple, - StringRef SysrootPath, - const PathRemapper &Remapper, - writer::ModuleInfoWriterCallback GetInfoForModule) -: FileMgr(FileMgr), Remapper(Remapper) { +IndexUnitWriter::IndexUnitWriter( + FileManager &FileMgr, StringRef StorePath, StringRef ProviderIdentifier, + StringRef ProviderVersion, bool 
Compress, StringRef OutputFile, + StringRef ModuleName, OptionalFileEntryRef MainFile, bool IsSystem, + bool IsModuleUnit, bool IsDebugCompilation, StringRef TargetTriple, + StringRef SysrootPath, const PathRemapper &Remapper, + writer::ModuleInfoWriterCallback GetInfoForModule) + : FileMgr(FileMgr), Compress(Compress), Remapper(Remapper) { this->UnitsPath = StorePath; store::appendUnitSubDir(this->UnitsPath); this->ProviderIdentifier = std::string(ProviderIdentifier); @@ -393,7 +387,34 @@ bool IndexUnitWriter::write(std::string &Error) { } raw_fd_ostream OS(TempFD, /*shouldClose=*/true); - OS.write(Buffer.data(), Buffer.size()); + if (Compress) { + if (!llvm::compression::zlib::isAvailable()) { + Error = "Zlib not available to compress record file"; + return true; + } + + // See comment in `IndexRecordWriter::endRecord` for the rationale for using + // `BestSpeed`. + auto compressionLevel = compression::zlib::BestSpeedCompression; + ArrayRef bufferRef = llvm::arrayRefFromStringRef(Buffer); + llvm::SmallVector compressed; + llvm::compression::zlib::compress(bufferRef, compressed, compressionLevel); + + // Write the `CIDXU` (compressed index unit) marker to indicate that this + // is a compressed unit file. + OS << "CIDXU"; + + // Write the size of the uncompressed unit so that we can allocate a + // buffer of the corresponding size when decompressing it. 
+ char Buf[4]; + llvm::support::endian::write32le(Buf, bufferRef.size()); + OS.write(Buf, sizeof(Buf)); + + // Write the actual compressed data + OS << llvm::toStringRef(compressed); + } else { + OS << Buffer; + } OS.close(); if (OS.has_error()) { diff --git a/clang/lib/Index/IndexingAction.cpp b/clang/lib/Index/IndexingAction.cpp index 4d0d700c13df9..bb79e5a1faf1c 100644 --- a/clang/lib/Index/IndexingAction.cpp +++ b/clang/lib/Index/IndexingAction.cpp @@ -838,9 +838,10 @@ static void writeUnitData(const CompilerInstance &CI, Remapper.addMapping(It->first, It->second); IndexUnitWriter UnitWriter( - CI.getFileManager(), DataPath, "clang", getClangVersion(), OutputFile, - ModuleName, RootFile, IsSystemUnit, IsModuleUnit, IsDebugCompilation, - CI.getTargetOpts().Triple, SysrootPath, Remapper, getModuleInfo); + CI.getFileManager(), DataPath, "clang", getClangVersion(), + CI.getFrontendOpts().IndexStoreCompress, OutputFile, ModuleName, RootFile, + IsSystemUnit, IsModuleUnit, IsDebugCompilation, CI.getTargetOpts().Triple, + SysrootPath, Remapper, getModuleInfo); DepProvider.visitFileDependencies( CI, [&](FileEntryRef FE, bool isSystemFile) { @@ -863,7 +864,8 @@ static void writeUnitData(const CompilerInstance &CI, } }); - ClangIndexRecordWriter RecordWriter(CI.getASTContext(), RecordOpts); + ClangIndexRecordWriter RecordWriter( + CI.getASTContext(), CI.getFrontendOpts().IndexStoreCompress, RecordOpts); for (auto I = Recorder.record_begin(), E = Recorder.record_end(); I != E; ++I) { FileID FID = I->first; diff --git a/clang/test/Index/Store/compress-index-store.c b/clang/test/Index/Store/compress-index-store.c new file mode 100644 index 0000000000000..70d1a71d43dfa --- /dev/null +++ b/clang/test/Index/Store/compress-index-store.c @@ -0,0 +1,8 @@ +// RUN: rm -rf %t.idx +// RUN: %clang_cc1 %s -index-store-path %t.idx -index-store-compress +// RUN: c-index-test core -print-unit %t.idx | FileCheck --check-prefix=UNIT %s +// RUN: c-index-test core -print-record %t.idx | 
FileCheck --check-prefix=RECORD %s + +// UNIT: main-path: {{.*}}/compress-index-store.c +// RECORD: [[@LINE+1]]:6 | function/C | c:@F@foo | Decl | rel: 0 +void foo(int *p); diff --git a/clang/tools/c-index-test/JSONAggregation.cpp b/clang/tools/c-index-test/JSONAggregation.cpp index 6f97e70d706e4..fa6cc4d186c30 100644 --- a/clang/tools/c-index-test/JSONAggregation.cpp +++ b/clang/tools/c-index-test/JSONAggregation.cpp @@ -215,7 +215,8 @@ std::unique_ptr Aggregator::processRecord(StringRef recordFile) { std::string error; auto recordReader = IndexRecordReader(Store, recordFile, error); if (!recordReader) { - errs() << "failed reading record file: " << recordFile << '\n'; + errs() << "failed reading record file: " << recordFile << ": " << error + << '\n'; ::exit(1); } auto record = std::make_unique();