1616 * limitations under the License.
1717 */
1818
19- #include " orc/Int128.hh"
19+ #include " ColumnReader.hh"
20+
21+ #include < cmath>
2022
2123#include " Adaptor.hh"
2224#include " ByteRLE.hh"
23- #include " ColumnReader.hh"
2425#include " ConvertColumnReader.hh"
26+ #include " DictionaryLoader.hh"
2527#include " RLE.hh"
2628#include " SchemaEvolution.hh"
2729#include " orc/Exceptions.hh"
30+ #include " orc/Int128.hh"
2831
2932#include < math.h>
3033#include < iostream>
@@ -36,19 +39,6 @@ namespace orc {
3639 // PASS
3740 }
3841
39- inline RleVersion convertRleVersion (proto::ColumnEncoding_Kind kind) {
40- switch (static_cast <int64_t >(kind)) {
41- case proto::ColumnEncoding_Kind_DIRECT:
42- case proto::ColumnEncoding_Kind_DICTIONARY:
43- return RleVersion_1;
44- case proto::ColumnEncoding_Kind_DIRECT_V2:
45- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
46- return RleVersion_2;
47- default :
48- throw ParseError (" Unknown encoding in convertRleVersion" );
49- }
50- }
51-
5242 ColumnReader::ColumnReader (const Type& type, StripeStreams& stripe)
5343 : columnId(type.getColumnId()),
5444 memoryPool (stripe.getMemoryPool()),
@@ -519,7 +509,10 @@ namespace orc {
519509 std::unique_ptr<RleDecoder> rle_;
520510
521511 public:
522- StringDictionaryColumnReader (const Type& type, StripeStreams& stipe);
512+ StringDictionaryColumnReader (const Type& type, StripeStreams& stripe);
513+
514+ StringDictionaryColumnReader (const Type& type, StripeStreams& stripe,
515+ const std::shared_ptr<StringDictionary> dictionary);
523516 ~StringDictionaryColumnReader () override ;
524517
525518 uint64_t skip (uint64_t numValues) override ;
@@ -533,39 +526,23 @@ namespace orc {
533526
534527 StringDictionaryColumnReader::StringDictionaryColumnReader (const Type& type,
535528 StripeStreams& stripe)
536- : ColumnReader(type, stripe), dictionary_(new StringDictionary(stripe.getMemoryPool())) {
529+ : StringDictionaryColumnReader(type, stripe, nullptr ) {}
530+
531+ StringDictionaryColumnReader::StringDictionaryColumnReader (
532+ const Type& type, StripeStreams& stripe, const std::shared_ptr<StringDictionary> dictionary)
533+ : ColumnReader(type, stripe), dictionary_(dictionary) {
537534 RleVersion rleVersion = convertRleVersion (stripe.getEncoding (columnId).kind ());
538- uint32_t dictSize = stripe.getEncoding (columnId).dictionary_size ();
539535 std::unique_ptr<SeekableInputStream> stream =
540536 stripe.getStream (columnId, proto::Stream_Kind_DATA, true );
541537 if (stream == nullptr ) {
542538 throw ParseError (" DATA stream not found in StringDictionaryColumn" );
543539 }
544540 rle_ = createRleDecoder (std::move (stream), false , rleVersion, memoryPool, metrics);
545- stream = stripe.getStream (columnId, proto::Stream_Kind_LENGTH, false );
546- if (dictSize > 0 && stream == nullptr ) {
547- throw ParseError (" LENGTH stream not found in StringDictionaryColumn" );
548- }
549- std::unique_ptr<RleDecoder> lengthDecoder =
550- createRleDecoder (std::move (stream), false , rleVersion, memoryPool, metrics);
551- dictionary_->dictionaryOffset .resize (dictSize + 1 );
552- int64_t * lengthArray = dictionary_->dictionaryOffset .data ();
553- lengthDecoder->next (lengthArray + 1 , dictSize, nullptr );
554- lengthArray[0 ] = 0 ;
555- for (uint32_t i = 1 ; i < dictSize + 1 ; ++i) {
556- if (lengthArray[i] < 0 ) {
557- throw ParseError (" Negative dictionary entry length" );
558- }
559- lengthArray[i] += lengthArray[i - 1 ];
560- }
561- int64_t blobSize = lengthArray[dictSize];
562- dictionary_->dictionaryBlob .resize (static_cast <uint64_t >(blobSize));
563- std::unique_ptr<SeekableInputStream> blobStream =
564- stripe.getStream (columnId, proto::Stream_Kind_DICTIONARY_DATA, false );
565- if (blobSize > 0 && blobStream == nullptr ) {
566- throw ParseError (" DICTIONARY_DATA stream not found in StringDictionaryColumn" );
541+
542+ // If no dictionary was provided, load it
543+ if (!dictionary_) {
544+ dictionary_ = loadStringDictionary (columnId, stripe, memoryPool);
567545 }
568- readFully (dictionary_->dictionaryBlob .data (), blobSize, blobStream.get ());
569546 }
570547
571548 StringDictionaryColumnReader::~StringDictionaryColumnReader () {
@@ -1717,8 +1694,15 @@ namespace orc {
17171694 case GEOGRAPHY:
17181695 switch (static_cast <int64_t >(stripe.getEncoding (type.getColumnId ()).kind ())) {
17191696 case proto::ColumnEncoding_Kind_DICTIONARY:
1720- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
1721- return std::make_unique<StringDictionaryColumnReader>(type, stripe);
1697+ case proto::ColumnEncoding_Kind_DICTIONARY_V2: {
1698+ // Check if we have a pre-loaded dictionary we can use
1699+ auto dictionary = stripe.getSharedDictionary (type.getColumnId ());
1700+ if (dictionary) {
1701+ return std::make_unique<StringDictionaryColumnReader>(type, stripe, dictionary);
1702+ } else {
1703+ return std::unique_ptr<ColumnReader>(new StringDictionaryColumnReader (type, stripe));
1704+ }
1705+ }
17221706 case proto::ColumnEncoding_Kind_DIRECT:
17231707 case proto::ColumnEncoding_Kind_DIRECT_V2:
17241708 return std::make_unique<StringDirectColumnReader>(type, stripe);
0 commit comments