// ============================================================================
// Suffix-array construction and prefix lookup for hdiff_private::TSuffixString,
// plus the optional fast-match filter cache (_SSTRING_FAST_MATCH>0).
//
// NOTE(review): this copy of the file looks damaged; the code text below is
// left exactly as found (only comments were added or translated). Restore it
// from a pristine copy before building.
//   * Line breaks are gone, so each `//` comment now runs to the end of its
//     physical line and preprocessor directives sit in the middle of lines.
//   * Text from a `<` followed by a letter up to the next `>` appears to have
//     been removed, e.g. `template static void`, `std::vector& v`,
//     `if (L0 static void _suffixString_create`,
//     `for (TSAInt i=0;i(&out_sstring[0]`,
//     `if (eq_len static void _build_range256`, `if (cur=kFMMinStrSize){`.
//     Whatever stood in those gaps cannot be seen here.
//   * The bare `#include` directives presumably named <string.h>,
//     <stdexcept>, <thread> and <algorithm>, judging by their trailing
//     comments - TODO confirm; the first bare `#include` cannot be identified.
//
// What is visible below, in order:
//   _clearVector(v)        swaps v with an empty temporary vector, so v ends
//                          up empty and the old contents die with the temporary.
//   getStringIsLess(...)   computes both lengths L0/L1; when
//                          _SA_SORTBY_STD_SORT is defined they are capped at
//                          kMaxCmpLength_sort (1024*4). Rest of body missing.
//   _suffixString_create() resizes out_sstring to (src_end-src), returns early
//                          for size 0, then fills it with the sorter chosen by
//                          _SA_SORTBY_STD_SORT / _SA_SORTBY_SAIS /
//                          _SA_SORTBY_DIVSUFSORT. The divsufsort path calls
//                          divsufsort64 when sizeof(TSAInt)==8 and divsufsort
//                          when it is 4. Throws
//                          std::runtime_error("suffixString_create() error.")
//                          when the sorter result rt!=0 (the std::sort path
//                          sets rt=-1 in its catch(...)).
//   __select_mid(p,n)      returns p+(n>>1); only defined when
//                          _SA_MATCHBY_STD_LOWER_BOUND is not defined.
//   _lower_bound(...)      with _SA_MATCHBY_STD_LOWER_BOUND it forwards to
//                          std::lower_bound using StringToken and
//                          TSuffixString_compare. Otherwise it bisects
//                          [rbegin,rend): each probe starts comparing at
//                          min(left_eq,right_eq) characters in; running out
//                          of str gives is_less=false, running out of src
//                          gives is_less=true. The handling around
//                          kMaxCmpLength_forLimitRangeDiff (1024*8) and the
//                          rest of the loop are missing.
//   _build_range256(...)   for c in 0..255 stores in range[c] the _lower_bound
//                          position of the one-char string {c}, each search
//                          resuming from the previous position;
//                          range[256]=SA_end.
//   _build_range(...)      same for all 256*256 two-char strings
//                          {cc>>8, cc&255}, stored as offsets from SA_begin;
//                          range[256*256]=SA_end-SA_begin.
//   TSuffixString ctors    null the source pointers and the 2-char table, call
//                          clear_cache(); the second one also calls
//                          resetSuffixString(). The destructor calls clear().
//   clear()                clear_cache(), nulls the source pointers and
//                          empties both suffix-array vectors.
//   resetSuffixString()    asserts src_begin<=src_end, stores the range,
//                          builds m_SA_large or m_SA_limit depending on
//                          isUseLargeSA() (emptying the other one), then calls
//                          build_cache(threadNum).
//   lower_bound()          when _SSTRING_FAST_MATCH>0 and fast match is in use,
//                          returns -1 if m_fastMatch.isHit(getHash(str)) is
//                          false. Otherwise it narrows the search with the
//                          2-char table (entries cc and cc+1 turned into byte
//                          offsets, min_eq=2) when that table exists and
//                          kMinStrLen>=2, else with the 1-char table
//                          (min_eq=1) when kMinStrLen>0, else returns -1.
//   clear_cache()          clears the fast-match filter if used, deletes the
//                          2-char table, zeroes the 257-entry 1-char table and
//                          the cached SA pointers, and resets m_lower_bound to
//                          _lower_bound_TInt32.
//   build_cache()          clear_cache(), rebuilds the fast-match cache if
//                          used, allocates the 2-char table only when
//                          SASize()>2*(1<<20), then fills the 1-char table
//                          (and the 2-char table if allocated) for whichever
//                          suffix array is active; returns early when that
//                          array is empty.
//   _filter_insert(...)    hashes the first window with getHash, then loops
//                          inserting h via insert_MT (when isMT under
//                          _IS_USED_MULTITHREAD) or insert. How h advances is
//                          missing.
//   (function head lost)   presumably TFastMatchForSString::buildMatchCache,
//                          which build_cache() calls - TODO confirm. For
//                          srcSize>=kFMMinStrSize it inits bf with
//                          rollSize=srcSize-(kFMMinStrSize-1). Under
//                          _IS_USED_MULTITHREAD, with threadNum>1 and
//                          rollSize>=4096, it clamps threadNum to
//                          rollSize/2048 and sizes a thread vector with
//                          threadNum-1 entries; what the threads run is in a
//                          gap. Otherwise bf.init(0,kFMZoom) when srcSize>0
//                          or src_begin!=0, else bf.clear().
//
// NOTE(review), things to confirm once the file is restored:
//   * In _suffixString_create `size` is a size_t, so `if (size<0)` can never
//     be true.
//   * TSuffixString::lower_bound reads str[0] (and str[1] on the 2-char path)
//     without looking at str_end; the commented-out
//     `assert(str_end-str>=2)` suggests callers must supply at least two
//     characters - verify against callers.
//   * build_cache sizes the 2-char table with sizeof(size_t) for the large SA
//     but reads and writes it as TInt; confirm sizeof(TInt)<=sizeof(size_t).
// ============================================================================
//suffix_string.cpp // /* The MIT License (MIT) Copyright (c) 2012-2017 HouSisong Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "suffix_string.h" #include #include //memset #include //std::runtime_error #include "../../../libParallel/parallel_import.h" #if (_IS_USED_MULTITHREAD) #include //if used vc++, need >= vc2012 #endif //sort method selection. #ifndef _SA_SORTBY #define _SA_SORTBY //# define _SA_SORTBY_STD_SORT //# define _SA_SORTBY_SAIS # define _SA_SORTBY_DIVSUFSORT #endif//_SA_SORTBY //match/search method selection: whether to use std::lower_bound; otherwise the custom implementation is used. 
//#define _SA_MATCHBY_STD_LOWER_BOUND #if (defined _SA_SORTBY_STD_SORT) || (defined _SA_MATCHBY_STD_LOWER_BOUND) #include //sort,lower_bound #endif #ifdef _SA_SORTBY_SAIS #include "sais.hxx" #endif #ifdef _SA_SORTBY_DIVSUFSORT #include "libdivsufsort/divsufsort.h" #include "libdivsufsort/divsufsort64.h" #endif namespace hdiff_private{ template static void _clearVector(std::vector& v){ std::vector _tmp; v.swap(_tmp); } namespace { typedef TSuffixString::TInt TInt; typedef TSuffixString::TInt32 TInt32; typedef TSuffixString::TChar TChar; static bool getStringIsLess(const TChar* str0,const TChar* str0End, const TChar* str1,const TChar* str1End){ TInt L0=(TInt)(str0End-str0); TInt L1=(TInt)(str1End-str1); #ifdef _SA_SORTBY_STD_SORT const int kMaxCmpLength_sort=1024*4; //WARNING: this is a special workaround that keeps _suffixString_create, when it uses std::sort, // from degrading to O(n*n) complexity in some cases (unacceptable running time); capping the compare length keeps the algorithm // within O(kMaxCmpLength_sort*n), but the sorted result is then not a standard suffix array; if (L0>kMaxCmpLength_sort) L0=kMaxCmpLength_sort; if (L1>kMaxCmpLength_sort) L1=kMaxCmpLength_sort; #endif TInt LMin; if (L0 static void _suffixString_create(const TChar* src,const TChar* src_end, std::vector& out_sstring,size_t threadNum){ size_t size=(size_t)(src_end-src); if (size<0) throw std::runtime_error("suffixString_create() error."); out_sstring.resize(size); if (size<=0) return; #ifdef _SA_SORTBY_STD_SORT for (TSAInt i=0;i(&out_sstring[0],&out_sstring[0]+size, TSuffixString_compare(src,src_end)); } catch (...) 
{ rt=-1; } #endif #ifdef _SA_SORTBY_SAIS TSAInt rt=saisxx(src,&out_sstring[0],size); #endif #ifdef _SA_SORTBY_DIVSUFSORT saint_t rt=-1; if (sizeof(TSAInt)==8) rt=divsufsort64(src,(saidx64_t*)&out_sstring[0],(saidx64_t)size,(int)threadNum); else if (sizeof(TSAInt)==4) rt=divsufsort(src,(saidx32_t*)&out_sstring[0],(saidx32_t)size,(int)threadNum); #endif if (rt!=0) throw std::runtime_error("suffixString_create() error."); } #ifdef _SA_MATCHBY_STD_LOWER_BOUND #else template inline static T* __select_mid(T*& p,size_t n) { return p+(n>>1); } //'&' for hack cpu cache speed for xcode, somebody know way? #endif template inline static const T* _lower_bound(const T* rbegin,const T* rend, const TChar* str,const TChar* str_end, const TChar* src_begin,const TChar* src_end, size_t min_eq=0){ #ifdef _SA_MATCHBY_STD_LOWER_BOUND return std::lower_bound (rbegin,rend,StringToken(str,str_end),TSuffixString_compare(src_begin,src_end)); #else size_t left_eq=min_eq; size_t right_eq=min_eq; while (size_t len=(size_t)(rend-rbegin)) { const T* mid=__select_mid(rbegin,len); size_t eq_len=(left_eq<=right_eq)?left_eq:right_eq; const TChar* vs=str+eq_len; const TChar* ss=src_begin+(*mid)+eq_len; bool is_less; while (true) { if (vs==str_end) { is_less=false; break; }; if (ss==src_end) { is_less=true; break; }; TInt sub=(*ss)-(*vs); if (!sub) { ++vs; ++ss; ++eq_len; const int kMaxCmpLength_forLimitRangeDiff=1024*8; if (eq_len static void _build_range256(const T* SA_begin,const T* SA_end, const TChar* src_begin,const TChar* src_end, const T** range){ TChar str[1]; const T* pos=SA_begin; for (size_t c=0;c<256;++c){ str[0]=(TChar)(c); pos=_lower_bound(pos,SA_end,str,str+1,src_begin,src_end); range[c]=pos; } range[256]=SA_end; } template static void _build_range(const T* SA_begin,const T* SA_end, const TChar* src_begin,const TChar* src_end, T* range){ TChar str[2]; str[0]=0; str[1]=0; const T* pos=SA_begin; for (size_t cc=0;cc<256*256;++cc){ //cc is [c0,c1] str[0]=(TChar)(cc>>8); 
str[1]=(TChar)(cc&255); pos=_lower_bound(pos,SA_end,str,str+2,src_begin,src_end); range[cc]=(T)(pos-SA_begin); } range[256*256]=(T)(SA_end-SA_begin); } }//end namespace TSuffixString::TSuffixString(bool isUsedFastMatch) :m_src_begin(0),m_src_end(0),m_isUsedFastMatch(isUsedFastMatch),m_cached2char_range(0){ clear_cache(); } TSuffixString::TSuffixString(const TChar* src_begin,const TChar* src_end,bool isUsedFastMatch,size_t threadNum) :m_src_begin(0),m_src_end(0),m_isUsedFastMatch(isUsedFastMatch),m_cached2char_range(0){ clear_cache(); resetSuffixString(src_begin,src_end,threadNum); } TSuffixString::~TSuffixString(){ clear(); } void TSuffixString::clear(){ clear_cache(); m_src_begin=0; m_src_end=0; _clearVector(m_SA_limit); _clearVector(m_SA_large); } void TSuffixString::resetSuffixString(const TChar* src_begin,const TChar* src_end,size_t threadNum){ assert(src_begin<=src_end); m_src_begin=src_begin; m_src_end=src_end; if (isUseLargeSA()){ _clearVector(m_SA_limit); _suffixString_create(m_src_begin,m_src_end,m_SA_large,threadNum); }else{ assert(sizeof(TInt32)==4); _clearVector(m_SA_large); _suffixString_create(m_src_begin,m_src_end,m_SA_limit,threadNum); } build_cache(threadNum); } TInt TSuffixString::lower_bound(const TChar* str,const TChar* str_end)const{ //not use any cached range table //return m_lower_bound(m_cached_SA_begin,m_cached_SA_end, // str,str_end,m_src_begin,m_src_end,m_cached_SA_begin,0); #if (_SSTRING_FAST_MATCH>0) if (m_isUsedFastMatch&&(!m_fastMatch.isHit(TFastMatchForSString::getHash(str)))) return -1; #define kMinStrLen _SSTRING_FAST_MATCH #else //assert(str_end-str>=2); #define kMinStrLen 2 #endif if ((kMinStrLen>=2)&(m_cached2char_range!=0)){ size_t cc=((size_t)str[1]) | (((size_t)str[0])<<8); size_t r0,r1; if (isUseLargeSA()){ r0=((TInt*)m_cached2char_range)[cc]*sizeof(TInt); r1=((TInt*)m_cached2char_range)[cc+1]*sizeof(TInt); }else{ r0=((TInt32*)m_cached2char_range)[cc]*sizeof(TInt32); r1=((TInt32*)m_cached2char_range)[cc+1]*sizeof(TInt32); } 
return m_lower_bound((TChar*)m_cached_SA_begin+r0,(TChar*)m_cached_SA_begin+r1, str,str_end,m_src_begin,m_src_end,m_cached_SA_begin,2); }else if (kMinStrLen>0){ size_t c=str[0]; return m_lower_bound(m_cached1char_range[c],m_cached1char_range[c+1], str,str_end,m_src_begin,m_src_end,m_cached_SA_begin,1); }else{ return -1; } } void TSuffixString::clear_cache(){ #if (_SSTRING_FAST_MATCH>0) if (m_isUsedFastMatch) m_fastMatch.clear(); #endif if (m_cached2char_range){ delete [](TChar*)m_cached2char_range; m_cached2char_range=0; } memset(&m_cached1char_range[0],0,sizeof(void*)*(256+1)); m_cached_SA_begin=0; m_cached_SA_end=0; m_lower_bound=(t_lower_bound_func)_lower_bound_TInt32;//safe } void TSuffixString::build_cache(size_t threadNum){ clear_cache(); #if (_SSTRING_FAST_MATCH>0) if (m_isUsedFastMatch) m_fastMatch.buildMatchCache(m_src_begin,m_src_end,threadNum); #endif const size_t kUsedCacheMinSASize =2*(1<<20); //only enable the large cache table when the string is large enough. if (SASize()>kUsedCacheMinSASize){ m_cached2char_range=new TChar[(256*256+1)*(isUseLargeSA()?sizeof(size_t):sizeof(TInt32))]; } if (isUseLargeSA()){ m_lower_bound=(t_lower_bound_func)_lower_bound_TInt; if (m_SA_large.empty()) return; m_cached_SA_begin=&m_SA_large[0]; m_cached_SA_end=&m_SA_large[0]+m_SA_large.size(); _build_range256((TInt*)m_cached_SA_begin,(TInt*)m_cached_SA_end, m_src_begin,m_src_end,(const TInt**)&m_cached1char_range[0]); if (m_cached2char_range){ _build_range((TInt*)m_cached_SA_begin,(TInt*)m_cached_SA_end, m_src_begin,m_src_end,(TInt*)m_cached2char_range); } }else{ m_lower_bound=(t_lower_bound_func)_lower_bound_TInt32; if (m_SA_limit.empty()) return; m_cached_SA_begin=&m_SA_limit[0]; m_cached_SA_end=&m_SA_limit[0]+m_SA_limit.size(); _build_range256((TInt32*)m_cached_SA_begin,(TInt32*)m_cached_SA_end, m_src_begin,m_src_end,(const TInt32**)&m_cached1char_range[0]); if (m_cached2char_range){ _build_range((TInt32*)m_cached_SA_begin,(TInt32*)m_cached_SA_end, m_src_begin,m_src_end,(TInt32*)m_cached2char_range); } } } #if 
(_SSTRING_FAST_MATCH>0) template static void _filter_insert(TBloomFilter* filter, const TChar* src_begin,const TChar* src_end){ const TChar* cur = src_begin; TFastMatchForSString::THash h=TFastMatchForSString::getHash(cur); cur+=TFastMatchForSString::kFMMinStrSize; do { #if (_IS_USED_MULTITHREAD) if (isMT) filter->insert_MT(h); else #endif filter->insert(h); if (cur=kFMMinStrSize){ const size_t rollSize=srcSize-(kFMMinStrSize-1); bf.init(rollSize,kFMZoom); //alloc large memory #if (_IS_USED_MULTITHREAD) const size_t kInsertMinParallelSize=4096; if ((threadNum>1)&&(rollSize>=kInsertMinParallelSize)) { const size_t maxThreanNum=rollSize/(kInsertMinParallelSize/2); threadNum=(threadNum<=maxThreanNum)?threadNum:maxThreanNum; const size_t step=rollSize/threadNum; const size_t threadCount=threadNum-1; std::vector threads(threadCount); for (size_t i=0;i,&bf,src_begin,src_begin+step+(kFMMinStrSize-1)); _filter_insert(&bf,src_begin,src_end); for (size_t i=0;i(&bf,src_begin,src_end); } }else if ((srcSize>0)||(src_begin!=0)) bf.init(0,kFMZoom); else{ bf.clear(); } } #endif }//namespace hdiff_private