//  testHashClash.cpp
//  tool for HDiff
//  An estimation method for detecting hash clashes
/*
 The MIT License (MIT)
 Copyright (c) 2012-2019 HouSisong

 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
 files (the "Software"), to deal in the Software without
 restriction, including without limitation the rights to use,
 copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the
 Software is furnished to do so, subject to the following
 conditions:

 The above copyright notice and this permission notice shall be
 included in all copies of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
*/ #include #include #include #include #include #include "../libHDiffPatch/HDiff/private_diff/limit_mem_diff/adler_roll.h" #include "../_clock_for_demo.h" #define _IS_NEED_ZLIB 1 #define _IS_NEED_MD5 0 #if (_IS_NEED_ZLIB) #include "zlib.h" #endif #if (_IS_NEED_MD5) #include "md5.h" // https://sourceforge.net/projects/libmd5-rfc #endif typedef unsigned char TByte; typedef ptrdiff_t TInt; typedef size_t TUInt; #define _IS_USES_MY_RAND #ifdef _IS_USES_MY_RAND class CMyRand { public: unsigned int _my_holdrand; public: inline CMyRand() :_my_holdrand(1) {} inline int _my_rand() { unsigned int result = _my_holdrand * 214013 + 2531011; _my_holdrand = result; return (result >> 16) & RAND_MAX; } }; static CMyRand _MyRand; inline int _rand() { return _MyRand._my_rand(); } inline void _srand(unsigned int seed) { _MyRand._my_holdrand = seed; } #else #define _rand rand #define _srand srand #endif /* //interface struct THash{ typename TValue; static const char* name() const; void hash_begin(); void hash(const TByte* pdata,const TByte* pdataEnd); void hash_finish(TValue* hv); };*/ #if (_IS_NEED_MD5) struct THash_md5_128{ typedef std::pair TValue; inline static const char* name() { return "md5_128"; } md5_state_t _hv; inline void hash_begin() { md5_init(&_hv); } inline void hash(const TByte* pdata,const TByte* pdata_end) { md5_append(&_hv,pdata,(int)(pdata_end-pdata)); } inline void hash_end(TValue* hv) { md5_finish(&_hv,(TByte*)hv); } }; namespace std{ template<> struct hash{ inline size_t operator()(const THash_md5_128::TValue& v) const{ return v.first^v.second; } }; template<> struct less{ inline bool operator()(const THash_md5_128::TValue& x,const THash_md5_128::TValue& y)const{ return (x.first^x.second) < (y.first^y.second); } }; } #endif #if (_IS_NEED_ZLIB) struct THash_crc32{ typedef uint32_t TValue; inline static const char* name() { return "crc32"; } TValue _hv; inline void hash_begin() { _hv=(TValue)crc32(0,0,0); } inline void hash(const TByte* pdata,const TByte* 
pdata_end) { _hv=(TValue)crc32(_hv,pdata,(uInt)(pdata_end-pdata)); } inline void hash_end(TValue* hv) { *hv=_hv; } }; struct THash_adler32{ typedef uint32_t TValue; inline static const char* name() { return "adler32"; } TValue _hv; inline void hash_begin() { _hv=(TValue)adler32(0,0,0); } inline void hash(const TByte* pdata,const TByte* pdata_end) { _hv=(TValue)adler32(_hv,pdata,(uInt)(pdata_end-pdata)); } inline void hash_end(TValue* hv) { *hv=_hv; } }; #endif struct THash_adler32h{ typedef uint32_t TValue; inline static const char* name() { return "adler32h"; } TValue _hv; inline void hash_begin() { _hv=adler32_start(0,0); } inline void hash(const TByte* pdata,const TByte* pdata_end) { _hv=adler32_append(_hv,pdata,(pdata_end-pdata)); //assert(_hv==(TValue)adler32(_hv,pdata,(pdata_end-pdata))); } inline void hash_end(TValue* hv) { *hv=_hv; } }; template struct THash_adler32h_bit{ typedef uint32_t TValue; inline static const char* name() { return "adler32h_bit"; } TValue _hv; inline void hash_begin() { _hv=adler32_start(0,0); } inline void hash(const TByte* pdata,const TByte* pdata_end) { _hv=adler32_append(_hv,pdata,(pdata_end-pdata)); } inline void hash_end(TValue* hv) { *hv=_hv&((1< void test(const TByte* data,const TByte* data_end){ typedef typename THash::TValue TValue; typedef std::pair TPair; typedef std::unordered_map TMap; double time0=clock_s(); const size_t clip_count=sizeof(TValue)/sizeof(TUInt); assert(clip_count*sizeof(TUInt)==sizeof(TValue)); //unsupport other bit TMap maps[clip_count]; for (size_t m=0;m>32),(int)kTestMask); printf("%s%s ",THash::name(),std::string(12-strlen(THash::name()),' ').c_str()); uint64_t curClashMin=0; uint64_t clashs[clip_count]={0}; double clashBases[clip_count]={0}; size_t i=0; while (curClashMinsecond; const TByte* vf=v.first; if ((pv_end-pv)!=(v.second-vf)){ ++clash; clashBase+=map.size(); ++i; } else if (pv==vf){ //same i }else{ bool isEq=true; for (size_t e=0; e0) printf("["); for (size_t m=0;m0) printf(" "); if 
(clashR>0){ //printf("%.3e(%.1fbit)",clashR,log2(1/clashR)); printf("%.2fbit",log2(1/clashR)); }else{ printf("0/%.3e",clashBases[m]); } } if (clip_count>0) printf("]"); printf(" \ttime:%.1fs\n",(clock_s()-time0)); } #if (_IS_NEED_FAST_ADLER128) template void test_fadler128(const TByte* data,const TByte* data_end){ typedef THash_fadler128 THash; typedef uint64_t TUInt; typedef typename THash::TValue TValue; typedef std::pair TPair; typedef std::unordered_map TMap; double time0=clock_s(); const size_t clip_count=1; TMap maps[clip_count]; for (size_t m=0;m>32),(int)kTestMask1); printf("%08X %08X] ",(int)(kTestMask0>>32),(int)kTestMask0); printf("%s%s ",THash::name(),std::string(10-strlen(THash::name()),' ').c_str()); uint64_t curClashMin=0; uint64_t clashs[clip_count]={0}; double clashBases[clip_count]={0}; size_t i=0; while (curClashMinsecond; const TByte* vf=v.first; if ((pv_end-pv)!=(v.second-vf)){ ++clash; clashBase+=map.size(); ++i; } else if (pv==vf){ //same i }else{ bool isEq=true; for (size_t e=0; e0) printf("["); for (size_t m=0;m0) printf(" "); if (clashR>0){ //printf("%.3e(%.1fbit)",clashR,log2(1/clashR)); printf("%.2fbit",log2(1/clashR)); }else{ printf("0/%.3e",clashBases[m]); } } if (clip_count>0) printf("]"); printf(" \ttime:%.1fs\n",(clock_s()-time0)); } #endif int main() { double bestCR_32bit =1.0/(((uint64_t)1)<<32); double bestCR_64bit =bestCR_32bit*bestCR_32bit; double bestCR_128bit=bestCR_64bit*bestCR_64bit; printf("32bit hash best clash rate: %.3e (1/%llu) \n", bestCR_32bit,(((uint64_t)1)<<32)); printf("48bit hash best clash rate: %.3e (1/%llu) \n", 1.0/(((uint64_t)1)<<48),(((uint64_t)1)<<48)); printf("64bit hash best clash rate: %.3e (1/%llu%llu) \n", bestCR_64bit,(((uint64_t)(~(uint64_t)0)))/10,(((uint64_t)(~(uint64_t)0)))%10+1); printf("128bit hash best clash rate: %.3e (1/%.3e) \n\n", bestCR_128bit,1/bestCR_128bit); std::vector data(kRandTestMaxSize); unsigned int rand_seed=0; _srand(rand_seed); for (size_t i=0; 
i,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); test,uint32_t>(data.data(),data.data()+data.size()); return 0; //*/ //* kMinClash=100000; test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); #if (_IS_NEED_ZLIB) test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); 
test(data.data(),data.data()+data.size()); #endif #if (_IS_NEED_MD5) test(data.data(),data.data()+data.size()); #endif test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); //test(data.data(),data.data()+data.size()); #if (_IS_NEED_FAST_ADLER128) test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); kMinClash=50; test(data.data(),data.data()+data.size()); #endif printf("\n"); //*/ //* printf("NOTE: test fadler64 32bit ...\n"); kMinClash=100000; test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); test(data.data(),data.data()+data.size()); printf("\n"); //*/ /* #if (_IS_NEED_FAST_ADLER128) printf("NOTE: not enough time to get next test results ...\n"); kMinClash=1; test_fadler128(data.data(),data.data()+data.size()); test_fadler128(data.data(),data.data()+data.size()); test_fadler128(data.data(),data.data()+data.size()); test_fadler128(data.data(),data.data()+data.size()); printf("\n"); #endif //*/ //* printf("NOTE: not enough time to get next test results ...\n"); kMinClash=1; // for timesaving but increase deviation test(data.data(),data.data()+data.size()); #if (_IS_NEED_MD5) test(data.data(),data.data()+data.size()); #endif //*/ return 0; }