/***************************************************************************
                          parallelintransitivesorting.cpp  - 
		Parallel Sorting of an Intransitive Total Ordered Set
                             -------------------
    begin                : Fri Dec  5 13:55:54 EST 2003
    copyright            : (C) |2003| by Johannes Singler
    email                : johannes@singler.name
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdlib.h>                                    
#include <math.h>
#include <time.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/times.h>

#define bool int
#define true 1
#define false 0

typedef unsigned long uint32;

//please set the following options with the compile statement

//print lots of debug output
//#define DEBUG_OUTPUT
//input by a quadratic matrix
//#define MATRIX_INPUT
//one byte per game (faster)
//#define BYTE
//or only one bit (slower)
//#define BIT
//randomize input data (for MATRIX_INPUT)
//#define RANDOM_INPUT
//calculate input
//#define CALCULATED_INPUT
//do not use pthread conditions to synchronize
//#define BUSY_WAITING
//output the input data for debugging purposes
//#define DATA_OUTPUT

//number of players, bottom element index (looses all matches)
long n, bottom;
//the sequence to sort, temporary memory
long* sequence, * winnerSeq, * loserSeq;
//number of processors
int p;
//all the threads
pthread_t* threadPool;
struct tms dummy;

//what should the thread do next?
typedef enum {stop, wait, fmp, divide1, divide2, dacs} Order;

//the "local" variables for each thread
typedef struct Local
{
  //still busy?
  bool running;
  //what to do next?
  Order order;

  //parameters analog to the function argument
  long begin, end, subBegin, subEnd, numWinners, numLosers, winnerStart, loserStart, mediocrePlayer;
  long* mediocrePlayerPointer, * mediocrePlayerWins;
  int threadBegin, threadEnd;
} LOCAL;

#ifdef MATRIX_INPUT
#ifdef BYTE
char* tournament;
#define BEATS(i,j) (tournament[(j) * (n + 1) + (i)] == 'w')
#define GAME(i,j) (tournament[(j) * (n + 1) + (i)])
#endif

#ifdef BIT
uint32* tournament;
#define BEATS(i,j) (tournament[((j) * (n + 1) + (i)) / 32] & (1 << (((j) * (n + 1) + (i)) % 32)))
#define SETGAME(i,j,o) (tournament[((j) * (n + 1) + (i)) / 32] |= (o << (((j) * (n + 1) + (i)) % 32)))
#endif

//generate a random data set
void GenerateDataSet()
{
  srand(time(NULL));

#ifdef BYTE
  tournament = (char*) malloc((n + 1) * (n + 1) * sizeof(char));
  long i, j;
  for(i = 0; i < n; i++)
  {
    GAME(i,i) = 's';
    for(j = i + 1; j < n; j++)
    {
      bool b = (rand() % 2 == 0);
      GAME(i,j) = b ? 'w' : 'l';
      GAME(j,i) = b ? 'l' : 'w';
    }
  }
  for(i = 0; i < n; i++)
  {
    GAME(i,n) = 'w';
    GAME(n,i) = 'l';
  }
  GAME(n,n) = 's';
#endif
#ifdef BIT
  tournament = (uint32*) malloc((((n + 1) * (n + 1) + 31) / 32) * sizeof(uint32));
  long i, j;
  for(i = 0; i < n; i++)
  {
    SETGAME(i,i,0);
    for(j = i + 1; j < n; j++)
    {
      bool b = (rand() % 2 == 0);
      SETGAME(i,j,(b ? 1 : 0));
      SETGAME(j,i,(b ? 0 : 1));
    }
  }
  for(i = 0; i < n; i++)
  {
    SETGAME(i,n,1);
    SETGAME(n,i,0);
  }
  SETGAME(n,n,0);
#endif

}
#endif

//calculate outcome of a game deterministically
#ifdef CALCULATED_INPUT
bool Beats(long a, long b)
{
  if(a == n) return false;
  if(b == n) return true;
  if(a == b) return false;

  return ((a + b) % 3 == 0 || (a + b) % 6 == 1) ^ (a < b);
}

#define BEATS(a,b) Beats(a, b)
#endif

//output input data
void OutputDataSet()
{
#ifdef DATA_OUTPUT
  long j, i;
  for(j = 0; j < n; j++)
  {
    for(i = 0; i < n; i++)
    {
      putchar(BEATS(i,j) ? 'w':'l');
      putchar(' ');
    }
    putchar('\n');
  }
#endif
}

#define L(i) ((i) * 2 + 1)
#define R(i) ((i) * 2 + 2)

//decides whether t is an element of max(t, l, r)
bool Max(long t, long l, long r)
{
  return (BEATS(t,l) && BEATS(t,r)) || (BEATS(t,l) && BEATS(l,r)) || (BEATS(t,r)  && BEATS(r,l));
}

void SemiHeapify(long i, long* A)
{
  if(A[i] == bottom) return;
  if(!Max(A[i], A[L(i)], A[R(i)]))
  {
    long winner;
    if(Max(A[L(i)], A[R(i)], A[i])) winner = L(i);
    else winner = R(i);

    long h = A[i];
    A[i] = A[winner];
    A[winner] = h;

    SemiHeapify(winner, A);
  }
}

//build up the semi-heap
void BuildSemiHeap(long* A, long length)
{
  long i;
  for(i = length / 2 - 1; i >= 0; i--)
    SemiHeapify(i, A);
}

//replace an element in the semi-heap
void Replace(long* A, long i)
{
  if(A[L(i)] == bottom && A[R(i)] == bottom)
  {
    A[i] = bottom;
    return;
  }

  if(BEATS(A[i],A[L(i)]) && BEATS(A[L(i)],A[R(i)]))
  {
    A[i] = A[L(i)];
    Replace(A, L(i));
  }
  else
  {
    A[i] = A[R(i)];
    Replace(A, R(i));
  }
}

//remove the root element and replace it recursively
long Pop(long* A)
{
  long result = A[0];

  Replace(A, 0);

  return result;
}

//sort sequentially
void SemiHeapSorting(long begin, long end)
{
  long* A;
  long length = end - begin;
  bottom = n;

  time_t start = times(&dummy);

  //int depth = (int) (log(length + 0.5) / log(2)) + 2;

  //long arrayLength = (1 << depth) - 1;
  long arrayLength = 2 * length + 2;
  A = (long*) malloc(sizeof(long) * arrayLength);
  long i;
  for(i = 0; i < length; i++)
    A[i] = sequence[begin + i];
  for(i = length; i < arrayLength; i++)
    A[i] = bottom;

  BuildSemiHeap(A, length);

  for(i = begin; i < end; i++)
    sequence[i] = Pop(A);


  time_t stop = times(&dummy);
  
#ifdef TIMING_OUTPUT
  printf("Sorting using semi-heap for %d elements took %f s.\n", length, ((double) (stop - start)) / (double) CLK_TCK);
#endif

  free(A);
}

//mutexed to protect mediocrePlayer and updateCond respectively
pthread_mutex_t mpMutex, updateMutex;
pthread_cond_t updateCond;

//array with one element for each thread
struct Local* locals;

//find mediocre player in subset
void FindMediocrePlayer(long begin, long end, long subBegin, long subEnd, long* mediocrePlayerPointer, long* mediocrePlayerWins)
{
  long player, length = end - begin, gamesWon, feasiblePlayer = -1;
  long i, j, goodDivision = (double) length / 2.2, feasiblePlayerWins = length / 4;
  for(i = subBegin; i < subEnd; i++)
  {
    if((*mediocrePlayerPointer) != -1) return;
    player = sequence[i];
    gamesWon = 0;
    for(j = begin; j < end; j++)
    {
      if(BEATS(player,sequence[j]))
        gamesWon++;
    }
	if((gamesWon >= goodDivision) && (length - gamesWon >= goodDivision))
    {
      pthread_mutex_lock(&mpMutex);
      *mediocrePlayerPointer = i;
      *mediocrePlayerWins = gamesWon;
      pthread_mutex_unlock(&mpMutex);
      return;
    }
	if((gamesWon >= feasiblePlayerWins) && (length - gamesWon >= feasiblePlayerWins))
    {
      feasiblePlayer = i;
      feasiblePlayerWins = gamesWon;
    }
	if(i >= 100 && feasiblePlayer != - 1) break;
  }
  pthread_mutex_lock(&mpMutex);
  *mediocrePlayerPointer = feasiblePlayer;
  *mediocrePlayerWins = feasiblePlayerWins;
  pthread_mutex_unlock(&mpMutex);
}

//count wins and losses against the medicre player and move data to temporary memory accordingly
void Divide1(long subBegin, long subEnd, long mediocrePlayer, int threadNo)
{
  locals[threadNo].numWinners = 0;
  locals[threadNo].numLosers = 0;
  int i;
  for(i = subBegin; i < subEnd; i++)
  {
    if(sequence[i] == mediocrePlayer) continue;
    if(BEATS(sequence[i],mediocrePlayer)) winnerSeq[subBegin + locals[threadNo].numWinners++] = sequence[i];
    else loserSeq[subBegin + locals[threadNo].numLosers++] = sequence[i];
  }
}

//arrange data from temporary memory
void Divide2(long subBegin, long winnerStart, long loserStart, long numWinners, long numLosers)
{
  long pos = winnerStart;
  int w, l;
  for(w = 0; w < numWinners; w++)
    sequence[pos++] = winnerSeq[subBegin + w];
  pos = loserStart;
  for(l = 0; l < numLosers; l++)
    sequence[pos++] = loserSeq[subBegin + l];
}

#define SWAP(x,y) {long h; h = (x); (x) = (y); (y) =h;}

//wait for threads begin ... (end - 1)
void WaitForThreads(int begin, int end)
{
  int t;
  for(t = begin; t < end; t++)  //wait for threads to stop
  {
    while(locals[t].running)
    {
#ifndef BUSY_WAITING
      pthread_mutex_lock(&updateMutex);
      pthread_cond_wait(&updateCond, &updateMutex);
      pthread_mutex_unlock(&updateMutex);
#endif
    }
  }
}

//sort by divide-and-conquer in parallel
void DivideAndConquerSort(long begin, long end, int threadBegin, int threadEnd)
{
#ifdef DEBUG_OUTPUT
    printf("%ld, %ld, %d, %d, %ld\n", begin, end, threadBegin, threadEnd, pthread_self());
#endif
  long length = end - begin;
  int numThreads = threadEnd - threadBegin;

  if(length < 2) return;

  if(numThreads == 1)
  {
#ifdef TIMING_OUTPUT
	printf("%d: %d\n", threadBegin, length);
#endif
    SemiHeapSorting(begin, end);
  }
  else
  {
    //Find mediocre player in parallel

    long mediocrePlayer = -1, mediocrePlayerWins;
    int t;
    for(t = 1; t < numThreads; t++)  //start threads
    {
      long subBegin = (long) (t * (double) length / (double) numThreads) + begin, subEnd = (long) ((t + 1) * (double) length / (double) numThreads) + begin;
      locals[threadBegin + t].begin = begin;
      locals[threadBegin + t].end = end;
      locals[threadBegin + t].subBegin = subBegin;
      locals[threadBegin + t].subEnd = subEnd;
      locals[threadBegin + t].mediocrePlayerPointer = &mediocrePlayer;
      locals[threadBegin + t].mediocrePlayerWins = &mediocrePlayerWins;
      locals[threadBegin + t].running = true;
      locals[threadBegin + t].order = fmp;
    }

    pthread_cond_broadcast(&updateCond);
    FindMediocrePlayer(begin, end, begin, (long) ((double) length / (double) numThreads) + begin, &mediocrePlayer, &mediocrePlayerWins);

    WaitForThreads(threadBegin + 1, threadEnd);

    //Divide step in parallel


    //first part: seperate into winnerSeq and loserSeq
    long pos = begin + length - mediocrePlayerWins - 1;
    SWAP(sequence[mediocrePlayer], sequence[pos]);

#ifdef DEBUG_OUTPUT
    long i;
    for(i = 0; i < n; i++)
      printf("%ld ", sequence[i]);
    putchar('\n');
#endif

    for(t = 1; t < numThreads; t++)  //start threads
    {
      long subBegin = (long) (t * (double) length / (double) numThreads) + begin, subEnd = (long) ((t + 1) * (double) length / (double) numThreads) + begin;
      locals[threadBegin + t].subBegin = subBegin;
      locals[threadBegin + t].subEnd = subEnd;
      locals[threadBegin + t].mediocrePlayer = sequence[pos];
      locals[threadBegin + t].running = true;
      locals[threadBegin + t].order = divide1;
    }
    pthread_cond_broadcast(&updateCond);
    Divide1(begin, (long) ((double) length / (double) numThreads) + begin, sequence[pos], threadBegin);

    WaitForThreads(threadBegin + 1, threadEnd);

    //second part: regroup

    long winnerPos = begin + locals[threadBegin].numWinners, loserPos = begin + (length - mediocrePlayerWins) + locals[threadBegin].numLosers;
    for(t = threadBegin + 1; t < threadEnd; t++)
    {
      locals[t].winnerStart = winnerPos;
      winnerPos += locals[t].numWinners;
      locals[t].loserStart = loserPos;
      loserPos += locals[t].numLosers;
      locals[t].running = true;
      locals[t].order = divide2;
    }

    pthread_cond_broadcast(&updateCond);
    Divide2(begin, begin, begin + (length - mediocrePlayerWins), locals[threadBegin].numWinners, locals[threadBegin].numLosers);

    WaitForThreads(threadBegin + 1, threadEnd);

#ifdef DEBUG_OUTPUT
    long i;
    for(i = 0; i < n; i++)
      printf("%ld ", sequence[i]);
    putchar('\n');
#endif

    //conquer recursive in parallel

    int beginNextGroup = threadBegin + numThreads / 2;

	if(beginNextGroup <= threadBegin) beginNextGroup = threadBegin + 1;
	if(beginNextGroup >= threadEnd) beginNextGroup = threadEnd - 1;
    locals[beginNextGroup].begin = pos + 1;
    locals[beginNextGroup].end = end;
    locals[beginNextGroup].threadBegin = beginNextGroup;
    locals[beginNextGroup].threadEnd = threadEnd;
    locals[beginNextGroup].running = true;
    locals[beginNextGroup].order = dacs;
    pthread_cond_broadcast(&updateCond);

    DivideAndConquerSort(begin, pos, threadBegin, beginNextGroup);

    void* dummy;
    pthread_join(threadPool[beginNextGroup], &dummy);
  }
}

//is the result correct?
bool CheckResult()
{
  bool result = true;
  long i;
  for(i = 0; i < n - 1; i++)
  {
#ifdef DEBUG_OUTPUT
    printf("%ld", sequence[i]);
#endif
    if(BEATS(sequence[i],sequence[i+1]))
    {
#ifdef DEBUG_OUTPUT
    printf("->");
#endif
    }
    else
    {
#ifdef DEBUG_OUTPUT
      printf("<-");
#endif
      result = false;
    }
  }
#ifdef DEBUG_OUTPUT
  printf("%ld\n", sequence[n - 1]);
#endif
  return result;
}

//hte main thread function, scheduling of the work takes place here
void* Work(void* knowledge)
{
  int number = (int) knowledge;

  struct Local* myLocal = &(locals[number]);

  while(myLocal->order != stop)
  {
    if(myLocal->order == wait || !myLocal->running)
    {
#ifndef BUSY_WAITING
      pthread_mutex_lock(&updateMutex);
      pthread_cond_wait(&updateCond, &updateMutex);
      pthread_mutex_unlock(&updateMutex);
#endif
    }
    else
    {
      switch(myLocal->order)
      {
        case fmp:
          FindMediocrePlayer(myLocal->begin, myLocal->end, myLocal->subBegin, myLocal->subEnd, myLocal->mediocrePlayerPointer, myLocal->mediocrePlayerWins);
          myLocal->order = wait;
        break;
        case divide1:
          Divide1(myLocal->subBegin, myLocal->subEnd, myLocal->mediocrePlayer, number);
          myLocal->order = wait;
        break;
        case divide2:
          Divide2(myLocal->subBegin, myLocal->winnerStart, myLocal->loserStart, myLocal->numWinners, myLocal->numLosers);
          myLocal->order = wait;
        break;
        case dacs:
          DivideAndConquerSort(myLocal->begin, myLocal->end, myLocal->threadBegin, myLocal->threadEnd);
          myLocal->order = stop;
        break;
        default:
        break;
      }
      myLocal->running = false;
      pthread_cond_broadcast(&updateCond);
    }
  }

  pthread_cond_broadcast(&updateCond);
  return NULL;
}

//the main function
int main(int argc, char *argv[])
{
  if(argc != 3)
  {
    printf("Usage: parallelintransitivesorting n p\n");
    return EXIT_FAILURE;
  }

  sscanf(argv[1], "%ld", &n);
  sscanf(argv[2], "%d", &p);

#ifdef MATRIX_INPUT
#ifdef RANDOM_INPUT
  GenerateDataSet();
#endif
#endif

  OutputDataSet();

  const int numRuns = 1;

  sequence = (long*) malloc(sizeof(long) * n);
  winnerSeq = (long*) malloc(sizeof(long) * n);
  loserSeq = (long*) malloc(sizeof(long) * n);

  threadPool = (pthread_t*) malloc(sizeof(pthread_t) * p);
  locals = (struct Local*) malloc(sizeof(struct Local) * p);

  threadPool[0] = pthread_self();

  pthread_mutex_init(&mpMutex, NULL);
  pthread_mutex_init(&updateMutex, NULL);
  pthread_cond_init(&updateCond, NULL);

  int cp;
  for(cp = 1; cp <= p; cp++)
  {
    time_t start = times(&dummy);

    int r, t;
    for(r = 0; r < numRuns; r++)
    {
      long i;
      for(i = 0; i < n; i++)
        sequence[i] = i;

      for(t = 1; t < cp; t++)
      {
        locals[t].order = wait;
        pthread_create(&(threadPool[t]), NULL, Work, (void*) t);
      }
      DivideAndConquerSort(0, n, 0, cp);
#ifdef DEBUG_OUTPUT    
    printf("-------------\n");
#endif
	}
    time_t stop = times(&dummy);
  
    printf("Sorting with %d threads took %f s per run.\n", cp, ((double) (stop - start)) / ((double) r * (double) CLK_TCK));
    if(!CheckResult())
      printf("Result is wrong!\n");
  }

  pthread_cond_destroy(&updateCond);
  pthread_mutex_destroy(&updateMutex);
  pthread_mutex_destroy(&mpMutex);

  free(sequence);
  free(winnerSeq);
  free(loserSeq);
#ifdef MATRIX_INPUT
  free(tournament);
#endif
  return EXIT_SUCCESS;
}

