Skip to content

Commit

Permalink
feat: use Zstd for index file compression instead of zlib
Browse files Browse the repository at this point in the history
[autofix.ci] apply automated fixes

a
  • Loading branch information
shenlebantongying committed Mar 24, 2024
1 parent 2cea76f commit e0cb233
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 35 deletions.
1 change: 1 addition & 0 deletions CMake_Unix.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ pkg_check_modules(PKGCONFIG_DEPS IMPORTED_TARGET
vorbis # .ogg
vorbisfile
liblzma
libzstd
xapian-core
)

Expand Down
3 changes: 2 additions & 1 deletion goldendict.pro
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ UI_DIR = build
MOC_DIR = build
RCC_DIR = build
LIBS += -lbz2 \
-llzo2
-llzo2 \
-lzstd

win32{
Debug: LIBS+= -L$$PWD/winlibs/lib/dbg/ -lzlibd
Expand Down
32 changes: 19 additions & 13 deletions src/btreeidx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
#include "btreeidx.hh"
#include "folding.hh"
#include "utf8.hh"
#include <QRunnable>
#include <QThreadPool>

#include <QSemaphore>
#include <math.h>
#include <string.h>
Expand All @@ -14,7 +13,6 @@
#include "wstring_qt.hh"
#include "utils.hh"

#include <QRegularExpression>
#include "wildcard.hh"
#include "globalbroadcaster.hh"

Expand All @@ -36,6 +34,7 @@ BtreeIndex::BtreeIndex():
idxFile( nullptr ),
rootNodeLoaded( false )
{
zstd_dctx.reset( ZSTD_createDCtx() );
}

BtreeDictionary::BtreeDictionary( string const & id, vector< string > const & dictionaryFiles ):
Expand Down Expand Up @@ -409,12 +408,12 @@ void BtreeIndex::readNode( uint32_t offset, vector< char > & out )

idxFile->read( &compressedData.front(), compressedData.size() );

unsigned long decompressedLength = out.size();
const size_t size_or_err =
ZSTD_decompressDCtx( zstd_dctx.get(), out.data(), out.size(), compressedData.data(), compressedData.size() );

if ( uncompress( (unsigned char *)&out.front(), &decompressedLength, &compressedData.front(), compressedData.size() )
!= Z_OK
|| decompressedLength != out.size() )
if ( ZSTD_isError( size_or_err ) || size_or_err != out.size() ) {
throw exFailedToDecompressNode();
}
}

char const * BtreeIndex::findChainOffsetExactOrPrefix(
Expand Down Expand Up @@ -758,6 +757,10 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
size_t maxElements,
uint32_t & lastLeafLinkOffset )
{

std::unique_ptr< ZSTD_CCtx, ZSTD::deleter > zstd_cctx;
zstd_cctx.reset( ZSTD_createCCtx() );

// We compress all the node data. This buffer would hold it.
vector< unsigned char > uncompressedData;

Expand Down Expand Up @@ -846,21 +849,24 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
}

// Save the result.
vector< unsigned char > compressedData( compressBound( uncompressedData.size() ) );
vector< unsigned char > compressedData( ZSTD_compressBound( uncompressedData.size() ) );

unsigned long compressedSize = compressedData.size();
const size_t size_or_err = ZSTD_compress2( zstd_cctx.get(),
compressedData.data(),
compressedData.size(),
uncompressedData.data(),
uncompressedData.size() );

if ( compress( &compressedData.front(), &compressedSize, &uncompressedData.front(), uncompressedData.size() )
!= Z_OK ) {
if ( ZSTD_isError( size_or_err ) ) {
qFatal( "Failed to compress btree node." );
abort();
}

uint32_t offset = file.tell();

file.write< uint32_t >( uncompressedData.size() );
file.write< uint32_t >( compressedSize );
file.write( &compressedData.front(), compressedSize );
file.write< uint32_t >( size_or_err );
file.write( &compressedData.front(), size_or_err );

if ( isLeaf ) {
// A link to the next leaf, which is zero and which will be updated
Expand Down
12 changes: 9 additions & 3 deletions src/btreeidx.hh
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
#include <stdint.h>
#include <string>
#include <vector>
#include <memory>

#include <QFuture>
#include <QList>
#include <QSet>
#include <QVector>

#include "zstd_wrapper.hh"


/// A base for the dictionary which creates a btree index to look up
/// the words.
Expand All @@ -28,11 +29,13 @@ using gd::wstring;
using std::vector;
using std::map;



enum {
/// This is to be bumped up each time the internal format changes.
/// The value isn't used here by itself, it is supposed to be added
/// to each dictionary's internal format version.
FormatVersion = 4
FormatVersion = 5
};

// These exceptions which might be thrown during the index traversal
Expand Down Expand Up @@ -139,6 +142,9 @@ protected:

protected:

std::unique_ptr< ZSTD_DCtx, ZSTD::deleter > zstd_dctx;

// The lifetimes of the two variables below are not managed by this class.
QMutex * idxFileMutex;
File::Class * idxFile;

Expand Down
29 changes: 18 additions & 11 deletions src/chunkedstorage.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */

#include "chunkedstorage.hh"
#include <zlib.h>
#include <zstd.h>
#include <string.h>

#include <QDataStream>
#include <QScopeGuard>
#include <QMutexLocker>
#include <QScopeGuard>

namespace ChunkedStorage {

Expand All @@ -19,6 +20,8 @@ Writer::Writer( File::Class & f ):
chunkStarted( false ),
bufferUsed( 0 )
{
zstd_cctx.reset( ZSTD_createCCtx() );

// Create a scratchpad at the beginning of file. We use it to write chunk
// table if it would fit, in order to save some seek times.

Expand Down Expand Up @@ -64,21 +67,22 @@ void Writer::addToBlock( void const * data, size_t size )

void Writer::saveCurrentChunk()
{
size_t maxCompressedSize = compressBound( bufferUsed );

if ( bufferCompressed.size() < maxCompressedSize )
if ( size_t maxCompressedSize = ZSTD_compressBound( bufferUsed ); bufferCompressed.size() < maxCompressedSize )
bufferCompressed.resize( maxCompressedSize );

unsigned long compressedSize = bufferCompressed.size();
const size_t size_or_err =
ZSTD_compress2( zstd_cctx.get(), bufferCompressed.data(), bufferCompressed.size(), buffer.data(), bufferUsed );

if ( compress( &bufferCompressed.front(), &compressedSize, &buffer.front(), bufferUsed ) != Z_OK )
if ( ZSTD_isError( size_or_err ) ) {
throw exFailedToCompressChunk();
}

offsets.push_back( file.tell() );

file.write( (uint32_t)bufferUsed );
file.write( (uint32_t)compressedSize );
file.write( &bufferCompressed.front(), compressedSize );
file.write( (uint32_t)size_or_err );
file.write( bufferCompressed.data(), size_or_err );

bufferUsed = 0;

Expand Down Expand Up @@ -118,6 +122,8 @@ uint32_t Writer::finish()
Reader::Reader( File::Class & f, uint32_t offset ):
file( f )
{
zstd_dctx.reset( ZSTD_createDCtx() );

file.seek( offset );

uint32_t size = file.read< uint32_t >();
Expand Down Expand Up @@ -163,10 +169,11 @@ char * Reader::getBlock( uint32_t address, vector< char > & chunk )
} );
Q_UNUSED( autoUnmap )

unsigned long decompressedLength = chunk.size();

if ( uncompress( (unsigned char *)&chunk.front(), &decompressedLength, chunkDataBytes, compressedSize ) != Z_OK
|| decompressedLength != chunk.size() ) {
size_t const size_or_err =
ZSTD_decompressDCtx( zstd_dctx.get(), chunk.data(), chunk.size(), chunkDataBytes, compressedSize );

if ( ZSTD_isError( size_or_err ) || size_or_err != chunk.size() ) {
throw exFailedToDecompressChunk();
}
}
Expand Down
9 changes: 8 additions & 1 deletion src/chunkedstorage.hh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
#include "file.hh"

#include <vector>
#include <stdint.h>
#include <memory>

#include "zstd_wrapper.hh"

/// A chunked compression storage. We use this for articles' bodies. The idea
/// is to store data in separately-compressed chunks, much like in dictzip,
Expand Down Expand Up @@ -66,6 +68,8 @@ private:
size_t bufferUsed;

void saveCurrentChunk();

std::unique_ptr< ZSTD_CCtx, ZSTD::deleter > zstd_cctx;
};

/// This class reads data blocks previously written by Writer.
Expand All @@ -83,6 +87,9 @@ public:
/// Uses the user-provided storage to load the entire chunk, and then to
/// return a pointer to the requested block inside it.
char * getBlock( uint32_t address, vector< char > & );

private:
std::unique_ptr< ZSTD_DCtx, ZSTD::deleter > zstd_dctx;
};

} // namespace ChunkedStorage
Expand Down
13 changes: 7 additions & 6 deletions src/dict/mdx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1270,12 +1270,12 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f

string dictId = Dictionary::makeDictionaryId( dictFiles );
string indexFile = indicesDir + dictId;
int t = 20;
mdxbench:
int t = 20;
mdxbench:
if ( true ) {
// Building the index

// gdDebug( "MDict: Building the index for dictionary: %s\n", fileName.c_str() );
// gdDebug( "MDict: Building the index for dictionary: %s\n", fileName.c_str() );
auto index_begin = std::chrono::high_resolution_clock::now();

MdictParser parser;
Expand Down Expand Up @@ -1435,9 +1435,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idx.rewind();
idx.write( &idxHeader, sizeof( idxHeader ) );

auto index_finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> c = index_finish - index_begin;
qDebug() << c;
auto index_finish = std::chrono::high_resolution_clock::now();
std::chrono::duration< double, std::milli > c = index_finish - index_begin;
qDebug() << c.count() << "ms";
// or, with Qt 6.6: qDebug() << c;

t = t - 1;
if ( t > 0 ) {
Expand Down
20 changes: 20 additions & 0 deletions src/zstd_wrapper.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#pragma once

#include <zstd.h>

namespace ZSTD {

/// Deleter functor for holding zstd contexts in a std::unique_ptr.
///
/// It provides one overload per context type, so the same deleter type can be
/// used for both `std::unique_ptr< ZSTD_CCtx, ZSTD::deleter >` and
/// `std::unique_ptr< ZSTD_DCtx, ZSTD::deleter >`.
///
/// Both overloads are `noexcept`: they only forward to the C free functions
/// of zstd, and a deleter is called from std::unique_ptr's destructor and
/// reset(), which must not throw.
struct deleter
{
  /// Releases a decompression context by passing it to ZSTD_freeDCtx().
  void operator()( ZSTD_DCtx * Ctx ) const noexcept
  {
    ZSTD_freeDCtx( Ctx );
  }

  /// Releases a compression context by passing it to ZSTD_freeCCtx().
  void operator()( ZSTD_CCtx * Ctx ) const noexcept
  {
    ZSTD_freeCCtx( Ctx );
  }
};

} // namespace ZSTD

0 comments on commit e0cb233

Please sign in to comment.