Horse Sort
Basic Design
This sort works by trying to find runs cases where the numbers are already in order. It then combines the runs in a systematic way.
It is a legitimate sort in that is has a complexity of N log N. It does not require any special setup or configuration it is simple faster if the list has a contiguous presorted area, or areas.
Each run of numbers is called a "horse" and is automatically combined when it is about one half the size of the next horse to make it efficient.
If the list of fully randomized or disordered it is about 4 times the speed of quick sort. This sort has a special verification ability. If the list is already sorted or near sorted it performs the sort extremely fast. It takes advantage of patterns that already exist in the data. In practice it is very common for the programmer to make small changes to a list and re-sort it.
This concept is intended for cases where the list is only going to have minor changes and then the sort is being used to verify its order.
This sort has similar advantages to Insertion Sort but is much faster in large data sets.
Observed Results
This data was created by running may different runs on the same computer.
In order to test the Horse Sorts verification ability I had to create a list that had some sorted elements and some elements that where out of order. To create the custom lists I randomized a sorted list and then resorted the first part of the list for each run.
The following debug messages help explain the process the sort uses:
The vector size was: 10
Presort:
8 6 0 9 5 7 4 1 2 3
New Horse Added from: 0 to 2
pre merge
position: 0 0 6 8
after merge
position: 0 0 6 8
New Horse Added from: 3 to 4
pre merge
position: 0 0 6 8
position: 3 5 9
after merge
position: 0 0 5 6 8 9
New Horse Added from: 5 to 7
pre merge
position: 0 0 5 6 8 9
position: 5 1 4 7
after merge
position: 0 0 1 4 5 6 7 8 9
New Horse Added from: 8 to 9
pre merge
position: 0 0 1 4 5 6 7 8 9
position: 8 2 3
combining everything
position: 0 0 1 2 3 4 5 6 7 8 9
Postsort:
0 1 2 3 4 5 6 7 8 9
Conclusion
This sort is only ideal in cases were it is known that parts of the list are presorted.
Note that the algorithm is approximately order N log N. It has considerable overhead but it degrades gently from the ideal case.
Implementation
All the heavy lifting is actually done by the "inplace_merge" This process seems to consume most of the run time.
This is a C++ implementation of the Horse Sort
This was written by Karl Kendall
Copyright (C) 2021 Karl Kendall
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#include <iostream>
#include <string>
#include <list>
#include <vector>
#include <algorithm>
#include <set>
#include <fstream>
#include <windows.h> //needed for timer
bool m_debug = false; // should be off by default;
//const int MYSIZE = 3000000;
//const int SMALL_BATCH = 30000;
const int MYSIZE = 3000000;
const int SMALL_BATCH = 30000;
using namespace std;
class Timer
{
LARGE_INTEGER time_begin , time_end;
double frequency;
public:
Timer()
{
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
frequency = (double)freq.QuadPart;
}
void start()
{
QueryPerformanceCounter(&time_begin);
}
double stop()
{
QueryPerformanceCounter(&time_end);
return ((time_end.QuadPart - time_begin.QuadPart)/frequency);
}
operator double()
{
return ((time_end.QuadPart - time_begin.QuadPart)/frequency);
}
};
// full
//Default rand is not providing enough random bits
//This spreads out the entropy so I get 24 random bits.
int full()
{
int full = 0;
for (int i = 0 ; i < 3; i++)
{
full = full << 8;
int random = rand();
full = full | (rand() & 255);
}
return full;
}
// myrandom
// simple function needed for random_shuffle to work
// - noticed that random shuffle was not working when trying to debug a failure in a million item vector
int myrandom (int i) { return full()%i;}
class Horse
{
public:
int start;
int end;
int level;
//int size;
Horse(int Start, int End) :
start(Start), end(End)
{
if (m_debug)
{
cout << "New Horse Added from: " << Start << " to " << End << endl;
}
level = 1;
}
};
void housekeeping(bool finished, std::list<Horse> & corral, std::vector<int> & myVector)
{
if(m_debug)
{
std::list<Horse>::iterator fit;
fit = corral.begin();
cout << "start housekeeping\n";
while( fit!=corral.end() )
{
cout << "level:" << fit->level << '\t' << fit->start << /*' ' << fit->end << */ '\t';
for(int i = fit->start; i <= fit->end ; i++)
cout << myVector[i] << ' ';
cout << endl;
++fit;
}
}
if(finished)
{
if(m_debug)
{
cout << "combining everything" << endl;
}
std::list<Horse>::reverse_iterator first, second;
second = corral.rbegin();
first = second++;
while(second != corral.rend() )
{
inplace_merge (myVector.begin() + second->start , myVector.begin() + first->start , myVector.begin() + first->end + 1);
second->end = first->end;
second->level++;
corral.pop_back();
second = corral.rbegin();
if(second != corral.rend())
{
first = second++;
}
}
}
else
{
std::list<Horse>::reverse_iterator it;
it = corral.rbegin();
std::list<Horse>::reverse_iterator first, second;
first = it++;
second = it;
while(it != corral.rend() && (first->end - first->start) >= (second->end - second->start)/2 )
{
inplace_merge (myVector.begin() + second->start , myVector.begin() + first->start , myVector.begin() + first->end + 1);
second->end = first->end;
second->level++;
corral.pop_back();
it = corral.rbegin();
if(it != corral.rend())
{
first = it++;
second = it;
}
}
}
if(m_debug)
{
std::list<Horse>::iterator fit;
fit = corral.begin();
cout << "end housekeeping\n";
while( fit!=corral.end() )
{
cout << "level:" << fit->level << '\t' << fit->start /* <</* ' ' << fit->end */<< '\t';
for(int i = fit->start; i <= fit->end ; i++)
cout << myVector[i] << ' ';
cout << endl;
++fit;
}
cout << endl;
}
}
void horsesort(std::vector<int> & myVector)
{
int size = myVector.size();
if (m_debug)
{
cout << "The vector size was: " << myVector.size() << endl;
cout << "Presort: " << endl;
for(int i = 0; i < myVector.size(); i++)
{
cout << myVector[i] << ' ' ;
}
cout << endl;
}
std::list<Horse> corral;
bool done = false;
int i = 0;
//int j = 0;
bool growing = false;
bool shrinking = false;
int start = i; // start vs end;
int end = i;
int last ; // as in the "last" the most recent former number.
int current = myVector[i];
while (!done)
{
last = current;
end = i;
i++;
if( i >= size)
{
done = true;
if(shrinking)
{
reverse(myVector.begin() + start, myVector.begin() + end + 1);
//we call reverse on the list
}
corral.push_back (Horse(start,end));
housekeeping(true, corral, myVector);
break; //
}
current = myVector[i];
if(!growing && ! shrinking)
{
if(current > last)
growing = true;
if(current < last)
shrinking = true;
}
if(growing && current < last)
{
//we need to make a horse
corral.push_back (Horse(start,end));
growing = false;
start = i;
housekeeping(false, corral, myVector);
}
if(shrinking && current > last)
{
//we need to make a horse
reverse(myVector.begin() + start, myVector.begin() + end + 1);
corral.push_back (Horse(start,end));
shrinking = false;
start = i;
housekeeping(false, corral, myVector);
}
}
if (m_debug)
{
cout << "Postsort: " << endl;
for(int i = 0; i < myVector.size(); i++)
{
cout << myVector[i] << ' ' ;
}
cout << endl;
}
}
std::vector<int> partsorted(int mysize, int batch)
{
//cout << "start of partsorted. " << batch ;
std::vector<int> myvector;
for(int i = 0; i < mysize; i++)
{
myvector.push_back(i);
}
std::random_shuffle ( myvector.begin(), myvector.end(), myrandom);
std::sort (myvector.begin(), myvector.begin() + myvector.size() - batch);
// arr now is a list of all the numbers with a few numbers extracted and at the end.
return myvector;
}
int main()
{
const int mysize = 3000000;
const int small_batch = 30000;
unsigned int seed = 465;
seed = time(NULL);
srand (seed);
std::vector<int> myVector;
std::vector<int> myVectorInorder;
cout << "starting:" << endl;
for(int i = 0; i < mysize; i++)
{
int temp = i;
myVector.push_back(temp);
}
myVectorInorder = myVector;
std::random_shuffle ( myVector.begin(), myVector.end(), myrandom);
std::vector<int> horse_sort = myVector;
std::vector<int> normal_sort = myVector;
std::vector<int> horse_partsorted = partsorted(mysize, small_batch);
std::vector<int> normal_partsorted = horse_partsorted;
std::vector<int> horse_inorder = myVectorInorder;
std::vector<int> normal_inorder = myVectorInorder;
Timer timer_normal;
Timer timer_horse;
Timer timer_normal_partsorted;
Timer timer_horse_partsorted;
Timer timer_normal_inorder;
Timer timer_horse_inorder;
Timer verification;
timer_normal.start();
std::sort (normal_sort.begin(), normal_sort.end());
timer_normal.stop();
timer_horse.start();
horsesort(horse_sort);
timer_horse.stop();
timer_normal_partsorted.start();
std::sort (normal_partsorted.begin(), normal_partsorted.end());
timer_normal_partsorted.stop();
timer_horse_partsorted.start();
horsesort(horse_partsorted);
timer_horse_partsorted.stop();
timer_normal_inorder.start();
std::sort (normal_inorder.begin(), normal_inorder.end());
timer_normal_inorder.stop();
timer_horse_inorder.start();
horsesort(horse_inorder);
timer_horse_inorder.stop();
verification.start();
bool fail = false;
int last = horse_inorder[0];
for(int i = 0; i < horse_inorder.size(); i++)
{
if(last <= horse_inorder[i])
{
// we are ok
last = horse_inorder[i];
}
else
fail = true;
}
verification.stop();
cout << "The vector size was: " << mysize << endl;
cout << "Time taken by normal sort is : " << timer_normal << endl;
cout << "Time taken by horse sort is : " << timer_horse << endl;
cout << "Time taken by normal sort for partsorted : " << timer_normal_partsorted << endl;
cout << "Time taken by horse sort for partsorted : " << timer_horse_partsorted << endl;
cout << "Time taken by normal sort for sorted array : " << timer_normal_inorder << endl;
cout << "Time taken by horse sort for sorted array : " << timer_horse_inorder << endl;
system("pause");
return 0;
}