#include <algorithm>
#include <cctype>
#include <fstream>
#include <iostream>
#include <list>
#include <vector>
//-------------------------------------------------------------------
char const * const inFilename = "xor2.txt";
//-------------------------------------------------------------------
namespace is_good_char_details
{
unsigned char const polskie_iso_8859_2 [18] = { 0xa1,0xc6,0xca,0xa3,0xd1,0xd3,0xa6,0xac,0xaf,0xb1,0xe6,0xea,0xb3,0xf1,0xf3,0xb6,0xbc,0xbf, };
unsigned char const polskie_win_1250 [18] = { 0xa5,0xc6,0xca,0xa3,0xd1,0xd3,0x8c,0x8f,0xaf,0xb9,0xe6,0xea,0xb3,0xf1,0xf3,0x9c,0x9f,0xbf, };
unsigned char const polskie_utf8_hi [ 2] = { 0xc4, 0xc5 };
unsigned char const polskie_utf8_lo [18] = { 0x84,0x86,0x98,0x81,0x83,0x93,0x9a,0xb9,0xbb,0x85,0x87,0x99,0x82,0x84,0xb3,0x9b,0xba,0xbc, };
unsigned char const polskie_utf16_hi [ 2] = { 0x00, 0x01 };
unsigned char const polskie_utf16_lo [18] = { 0x04,0x06,0x18,0x41,0x43,0xd3,0x5a,0x79,0x7b,0x05,0x07,0x19,0x42,0x44,0xf3,0x5b,0x7a,0x7c, };
unsigned char goodchar[256];
void init()
{
// assume nothing:
// any reasonable charset is possible
for(int i=0;i<256;i++) goodchar[i]= isprint(i) || isspace(i);
for(int i=0;i<18;i++) goodchar[polskie_win_1250[i]]=1;
for(int i=0;i<18;i++) goodchar[polskie_iso_8859_2[i]]=1;
for(int i=0;i<18;i++) goodchar[polskie_utf8_lo[i]]=1;
for(int i=0;i<2;i++) goodchar[polskie_utf8_hi[i]]=1;
for(int i=0;i<18;i++) goodchar[polskie_utf16_lo[i]]=1;
for(int i=0;i<2;i++) goodchar[polskie_utf16_hi[i]]=1;
}
}
bool is_good_char(unsigned char c) { return is_good_char_details::goodchar[c]; }
//-------------------------------------------------------------------
typedef std::basic_ifstream<unsigned char> u_ifstream;
typedef std::vector<unsigned char> byteary;
typedef byteary::const_iterator byteary_cit;
typedef std::pair<size_t, size_t> range;
typedef std::list<range> ranges;
typedef std::list<unsigned char> bytelist;
typedef bytelist::const_iterator bytelist_cit;
typedef std::vector<bytelist> bytelists;
typedef bytelists::const_iterator bytelists_cit;
//-------------------------------------------------------------------
// checks whever all bytes included in range could possibly be a characters
// END must satisfy conventional lastpos+1 assumption
bool checkAllGood(byteary_cit begin, byteary_cit end)
{
while(begin != end)
if(!is_good_char(*begin++))
return false;
return true;
}
// checks if the list of possible bytes at each key's position does
// actually form at least one possible key
bool checkKeyPossible(bytelists const & key)
{
for(bytelists_cit it = key.begin(); it != key.end(); ++it)
if(!it->size())
// if for ANY key index there is NO possible bytes
// - no such key can exist
return false;
return true;
}
// returns set of l-r both inclusive ranges of the input
// that does not contain any 0x0D0A sequences
// END must satisfy conventional lastpos+1 assumption
ranges findSafeRanges(byteary_cit begin, byteary_cit end)
{
ranges result;
byteary_cit range_start = begin;
for(byteary_cit it = begin; it != end && it+1 != end; ++it)
if(*it == 0x0d && *(it+1) == 0x0a)
{
result.push_back(std::make_pair(range_start - begin, (it - begin) - 1)); // -1 makes the range right-inclusive
it += 2; // skip the sequence
range_start = it; // next range may start after the 0x0D0A pair just been found
--it; // dirty fix to account the autoincrement from the loop
}
if(range_start != end) // was there an open range at the end of the input?
result.push_back(std::make_pair(range_start - begin, end - begin));
return result;
}
// removes from given set of byte ranges all those,
// which length is smaller than the specified
void trimSmallRanges(ranges & rngs, size_t minlength)
{
ranges::const_iterator it = rngs.begin();
while(it != rngs.end())
if(it->second - it->first < minlength)
rngs.erase(it++);
else
++it;
}
// for a range of encrypted input bytes, assuming some length of the encrypted key,
// test every byte of the input and check what could be the value of corresponding key byte
// (what key-value could had been used to generate such encoded-byte)
// returns a set (keylength long) of lists of values, each single list in set for each key position
// resulting lists are sorted in ascending order
bytelists findKeyPossb(byteary_cit i_begin, range const & range, size_t keyLength)
{
// how many blocks of length equal to the keyLength would fit into
// the given range?
const size_t blockCount = (range.second - range.first + 1) / keyLength;
// mind, that this function is NOT optimized in ANY way and it ONLY does scan whole-fitting subblocks
// any remaining bytes after the last whole subblock are not checked for the sake of simplicity
bytelists possibleKeyBytes(keyLength); // prepare keyLength lists
for(size_t keyIdx=0; keyIdx < keyLength; ++keyIdx) // for each key position == equal to in-subblock position
for(int possibleByte = 0; possibleByte < 256 /*byte max*/; ++possibleByte) // for each possible 'value'
{
bool thisByteSeemsOk = true;
unsigned char psbByte = (unsigned char)possibleByte;
for(size_t blockNumber = 0; thisByteSeemsOk && blockNumber < blockCount; ++blockNumber) // for each sub-block
{
size_t idx = keyIdx + blockNumber*keyLength;
unsigned char c = *(i_begin+idx) ^ psbByte; // extract input byte, try decrypting with 'value'
bool test = !!is_good_char(c); // if such key 'value' for this position and in this block
thisByteSeemsOk &= test; // is INVALID - does NOT produce a valid clear text, FAIL
}
if(thisByteSeemsOk) // if this key-value has NOT FAILED for ANY blocks, store it as possible key value!
possibleKeyBytes[keyIdx].push_back(psbByte);
}
return possibleKeyBytes;
}
// assuming the lists elements are ascdending, produces intersection of the two lists;
// resulting list consists only of values contained in both given lists
bytelist intersect(bytelist const & target, bytelist const & mask)
{
bytelist result;
bytelist_cit it1 = target.begin(), it2 = mask.begin();
while(it1 != target.end() && it2 != mask.end())
{
unsigned char v1 = *it1, v2 = *it2;
if(v1 > v2) ++it2;
else if(v1==v2){result.push_back(v1);++it1;++it2;}
else ++it1;
}
return result;
}
// takes two sets of key's possible values and intersects them, keyindex-vise;
// produces a set that will contain only possibilities common to both the left and right sets;
// for each index of the key, the result is list of common possible values at that index;
// function assumes that the SECOND set of possibilities has its indexes positively and circularily offsetted modulo keylength;
// that is,
// for left set {ABCDE}, right set {01234} and offset 0, the function will generate: {A&0, B&1, C&2, D&3, E&4}
// for left set {ABCDE}, right set {01234} and offset 2, the function will generate: {A&2, B&3, C&4, D&0, E&1}
// where A/B/C/D/E/0/1/2/3/4 denote some bytelist contained in the sets
// where & denotes a bytelist intersection
// slightly optimized: if it detects that any intersection produces empty result, stops and returns an empty response
// example:
// left set: {[3,6], [1,2], [5,4]}, right set: {[4,6], [3], [1,7]}, offset: 1 --> result: {[3], [1], [4]}
// left set: {[3,6], [1,2], [5,7]}, right set: {[4,6], [3], [1,7]}, offset: 1 --> result: {}
bytelists crossPossbKeys(bytelists const & keyA, bytelists const & keyB, size_t crossoffset)
{
bytelists result;
for(size_t idx = 0; idx < keyA.size(); ++idx)
{
bytelist cross = intersect(keyA[idx], keyB[(idx + crossoffset) % keyB.size()]);
if(!cross.size()) { result.clear(); break; }// detect and mark failures early
result.push_back(cross);
}
if(!result.size())
result.push_back(bytelist());
return result;
}
// playground
int main()
{
using namespace std;
is_good_char_details::init(); // assumes nothing: any reasonable (printable, polish) charset is possible
byteary fileBytes;
// read the input (encrypted) file
{
u_ifstream f(inFilename, ifstream::ate | ifstream::binary);
fileBytes.resize(f.tellg());
f.seekg(0);
f.read( &fileBytes[0], fileBytes.size());
}
// determine parts of the input that are safe for analysis
ranges rngs = findSafeRanges(fileBytes.begin(), fileBytes.end());
// remove those shorter than a doubled key length, as they are harder to analyze
// actually the file is quite long, so maybe even three or four lengths;
// assume the key was 256 bytes long
trimSmallRanges(rngs, 256*3);
// FOR THE XOR2.TXT, only 6 ranges are longer-equal to that threshold
ranges::const_iterator it = rngs.begin();
bytelists a = findKeyPossb(fileBytes.begin(), *it++, 256); // now, playing with those ranges A-F,
bytelists b = findKeyPossb(fileBytes.begin(), *it++, 256); // calculate what could have been
bytelists c = findKeyPossb(fileBytes.begin(), *it++, 256); // the key values used to generate them
bytelists d = findKeyPossb(fileBytes.begin(), *it++, 256); // from the cleartext (characterized by 'is_good_char')
bytelists e = findKeyPossb(fileBytes.begin(), *it++, 256); // encrypted by a simple XOR (with key of 256 bytes)
bytelists f = findKeyPossb(fileBytes.begin(), *it++, 256); //
if(!checkKeyPossible(a) || !checkKeyPossible(b) || !checkKeyPossible(c) || !checkKeyPossible(d) || !checkKeyPossible(e) || !checkKeyPossible(f))
{
// if ANY of the blocks happened to be non-generatible
cout << "FAIL" << endl; // oh well.. not a XOR? wrong key length? wrong charset in is_good_char?
return -1;
}
// FOR XOR2.TXT
// it happens that:
{
bool ab = a == b; // true
bool ac = a == c; // true
bool ad = a == d; // FALSE
bool ae = a == e; // FALSE
bool af = a == f; // true
bool de = d == e; // TRUE
}
// ranges A,B,C,F and D,E generate completely the same key possibilities
// thats a surprise, not even an offset between them?
// anyways, no point in intersecting possibilities that are equal
// from the original 6, we are left with 2
// remaining two might be offsetted in respect to each other
// check what offsets are possible;
// that is, assuming which offsets, an intersection of these possibilities
// still leaves some valid key possibility
for(size_t off = 0; off<256; ++off)
if(checkKeyPossible(crossPossbKeys(a, d, off)))
cout << off << endl;
// and it turns out that the only possible offset is .. NO offset again
// At this point I began to wonder, whever there is some subtle error
// in the analysis. I find it quite hard to believe that all those 6 ranges
// are perfectly aligned with each other. Unless the crypter was written/bugged
// to do it that way, it is very improbable.
// ok, going further:
bytelists x = crossPossbKeys(a, d, 0x00 /*the only possible offset between them*/);
{
bool xa = x == a; // false
bool xd = x == d; // true
}
// and it turns out that the possibilities of range A are a superset of those from D
// so, possibilities from A may be dropped out of analysis
// at this point you may for example write out the D, etc
// but I don't see any point in that - it is QUITE WIDE for ALL KEY INDEXES anyways.
int wtf = 8; wtf /= (wtf - 8); // a breakpoint! :)
// - END --------------------------------------------------------------------------------------
// for the result to be actualy usable, you would have to use some additional knowledge
// about the cleartext or the cipher or the key. for example, you could trim the is_good_char
// to accept fewer characters as valid. you could dig through more ranges that were found.
// mind that the original INPUT FILE contained MANY MORE RANGES that were 'easily' analyzable,
// but I have IGNORED all those smaller than 3*keylength completely intentionally.
// AT LEAST ALL THOSE >=1xKEYLENGHT ARE USABLE in a similar ways.
// I have NOT checked them, because I THINK that they are short, and will produce so WIDE key
// possibilities, that it will not invalidate any further possible keys.
// I may be wrong, weren't I so lazy, I'd check check them against all possible cross-offsets.
// also, the code is severely underoptimized. I tried to keep it straightforward both because
// I'm lazy, and because I wanted to show the very basics. It was meant to be an introductory
// example, not a NSA-grade distributed breaker:) I think it should be written all the way
// differently, but it was written as a test answering "is it really simple XOR?",
// so it is AS IS.
// have fun, if any
// q.
cin.sync();
cin.get();
}