// Copyright (c) 2009-2010 Intel Corporation
// All rights reserved.
// 
// WARRANTY DISCLAIMER
// 
// THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
// MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 
// Intel Corporation is the author of the Materials, and requests that all
// problem reports or change requests be submitted to it directly


//Blake: This code has been turned into a key/value map by storing the key in
//the upper 32 bits of a long and the value in the lower 32 bits.

inline long4 encode(int4 key,uint4 value){
	return upsample(key,value);
}
inline uint4 decodeValue(long4 kv){
	return (uint4)(kv.x,kv.y,kv.z,kv.w);	
}
inline int4 decodeKey(long4 kv){
	return (int4)((int)(kv.x>>32),(int)(kv.y>>32),(int)(kv.z>>32),(int)(kv.w>>32));
}
__kernel void BitonicSort(__global int4* theArray,__global uint4* theValues,
						 const uint stage,
						 const uint passOfStage,
						 const uint dir)
{
	uint i = get_global_id(0);
	long4 srcLeft, srcRight;
	long4 mask;
	long4 imask10 = (long4)(0,  0, -1, -1);
	long4 imask11 = (long4)(0, -1,  0, -1);
	long4 kv;
	if(stage > 0)
	{
		if(passOfStage > 0)	//upper level pass, exchange between two fours
		{
			ulong r = 1 << (passOfStage - 1);
			ulong lmask = r - 1;
			ulong left = ((i>>(passOfStage-1)) << passOfStage) + (i & lmask);
			ulong right = left + r;
			
			srcLeft =  encode(theArray[left],theValues[left]);
			srcRight = encode(theArray[right],theValues[right]);
			mask = srcLeft < srcRight;
			
			long4 imin = (srcLeft & mask) | (srcRight & ~mask);
			long4 imax = (srcLeft & ~mask) | (srcRight & mask);
			
			if( ((i>>(stage-1)) & 1) ^ dir )
			{
				theArray[left]  = decodeKey(imin);
				theArray[right] = decodeKey(imax);
				
				theValues[left]  = decodeValue(imin);
				theValues[right] = decodeValue(imax);
			}
			else
			{
				theArray[right] = decodeKey(imin);
				theArray[left]  = decodeKey(imax);
				
				theValues[right] = decodeValue(imin);
				theValues[left]  = decodeValue(imax);
			}
		}
		else	//last pass, sort inside one four
		{
			srcLeft = encode(theArray[i],theValues[i]);
			srcRight = srcLeft.zwxy;
			mask = (srcLeft < srcRight) ^ imask10;

			if(((i >> stage) & 1) ^ dir)
			{
				srcLeft = (srcLeft & mask) | (srcRight & ~mask);
				srcRight = srcLeft.yxwz;
				mask = (srcLeft < srcRight) ^ imask11;
				
				kv=(srcLeft & mask) | (srcRight & ~mask);
				theArray[i] = decodeKey(kv);
				theValues[i] = decodeValue(kv);
			}
			else
			{
				srcLeft = (srcLeft & ~mask) | (srcRight & mask);
				srcRight = srcLeft.yxwz;
				mask = (srcLeft < srcRight) ^ imask11;
				kv= (srcLeft & ~mask) | (srcRight & mask);
				theArray[i] = decodeKey(kv);
				theValues[i] = decodeValue(kv);
			}
		}
	}
	else	//first stage, sort inside one four
	{
		long4 imask0 = (long4)(0, -1, -1,  0);
		srcLeft = encode(theArray[i],theValues[i]);
		srcRight = srcLeft.yxwz;
		mask = (srcLeft < srcRight) ^ imask0;
		if( dir )
			srcLeft = (srcLeft & mask) | (srcRight & ~mask);
		else
			srcLeft = (srcLeft & ~mask) | (srcRight & mask);

		srcRight = srcLeft.zwxy;
		mask = (srcLeft < srcRight) ^ imask10;

		if((i & 1) ^ dir)
		{
			srcLeft = (srcLeft & mask) | (srcRight & ~mask);
			srcRight = srcLeft.yxwz;
			mask = (srcLeft < srcRight) ^ imask11;
			kv = (srcLeft & mask) | (srcRight & ~mask);
			theArray[i] = decodeKey(kv);
			theValues[i] = decodeValue(kv);
		}
		else
		{
			srcLeft = (srcLeft & ~mask) | (srcRight & mask);
			srcRight = srcLeft.yxwz;
			mask = (srcLeft < srcRight) ^ imask11;
			kv = (srcLeft & ~mask) | (srcRight & mask);
			theArray[i] = decodeKey(kv);
			theValues[i] = decodeValue(kv);
		}
	}
}
__kernel void bitonicInitMap(
    __global uint *d_SrcKey,
    __global uint *d_SrcVal,uint defaultKey,uint defaultVal){
	uint id=get_global_id(0);
	d_SrcKey[id]=defaultKey;
	d_SrcVal[id]=defaultVal;
}