static char rcsid[] = "$Id: sam_sort.c 223785 2020-12-16 15:27:18Z twu $";
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>		/* For off_t */
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "bool.h"
#include "mem.h"
#include "access.h"
#ifdef LOOKUP_SEQS
#include "complement.h"
#endif

#include "genomicpos.h"
#include "samheader.h"
#include "samread.h"
#include "samflags.h"
#include "stopwatch.h"
#include "datadir.h"
#include "filestring.h"
#include "getopt.h"


/* #define VERIFY_HASHSEQ 1 */

/* sam-to-bam conversions always need the headers */
#define SAM_HEADERS_ON_EMPTY_FILES 1


/************************************************************************
 *
 *  Check for correctness:
 *
 *    Run ./sam_sort -d <genome> --mark-dups --mark-first --dups-only --no-sam-headers on a SAM file
 *    Do "cut -f 1 | sort | uniq" to find all duplicate accessions
 *    
 *    Run the following Perl program on the original FASTQ files
 *    Do "grep '^>' | sed -e 's/>//' | sort | uniq" to use as a gold standard
 * 
 * use IO::File;
 * $fastq1 = $ARGV[0];
 * $fastq2 = $ARGV[1];
 * 
 * $FP1 = new IO::File($fastq1);
 * $FP2 = new IO::File($fastq2);
 * while (defined($line1 = <$FP1>) && defined($line2 = <$FP1>) && 
 *        defined($line3 = <$FP1>) && defined($line4 = <$FP1>) &&
 *        defined($line5 = <$FP2>) && defined($line6 = <$FP2>) && 
 *        defined($line7 = <$FP2>) && defined($line8 = <$FP2>)) {
 *     chop $line2; chop $line6;
 *     $queryseq5 = $line2; $queryseq3 = $line6;
 * 
 *     if ($queryseq5 lt $queryseq3) {
 *       $nseen{$queryseq5 . $queryseq3} += 1;
 *     } else {
 *       $nseen{$queryseq3 . $queryseq5} += 1;
 *     }
 * }
 * close($FP2);
 * close($FP1);
 * 
 * $FP1 = new IO::File($fastq1);
 * $FP2 = new IO::File($fastq2);
 * while (defined($line1 = <$FP1>) && defined($line2 = <$FP1>) && 
 *        defined($line3 = <$FP1>) && defined($line4 = <$FP1>) &&
 *        defined($line5 = <$FP2>) && defined($line6 = <$FP2>) && 
 *        defined($line7 = <$FP2>) && defined($line8 = <$FP2>)) {
 *     chop $line2; chop $line6;
 *     $queryseq5 = $line2; $queryseq3 = $line6;
 * 
 *     if ($queryseq5 lt $queryseq3) {
 * 	$queryseq = $queryseq5 . $queryseq3;
 *     } else {
 * 	$queryseq = $queryseq3 . $queryseq5;
 *     }
 * 
 *     if (($n = $nseen{$queryseq}) > 1) {
 *       ($header) = $line1 =~ /^@(\S+)/;
 *  	 print ">" . $header . "\n"; print $queryseq5 . "\n"; print $queryseq3 . "\n";
 *     }
 * }
 * close($FP2);
 * close($FP1);
 * 
 * exit;
 * 
 ************************************************************************/


/* Secondary ordering applied to entries that share a genomic position.
   Each value has a matching comparator below (see sort_cells). */
typedef enum {NO_SECONDARY_SORT, ORIG_SECONDARY_SORT, ACC_SECONDARY_SORT, MATEFWD_SECONDARY_SORT, MATEREV_SECONDARY_SORT} Secondary_sort_T;

/* General debugging statements; compiled out unless DEBUG is defined */
#ifdef DEBUG
#define debug(x) x
#else
#define debug(x)
#endif

/* Details of comparing sequences */
#ifdef DEBUG8
#define debug8(x) x
#else
#define debug8(x)
#endif

/* Details of getting queryseqs */
#ifdef DEBUG9
#define debug9(x) x
#else
#define debug9(x)
#endif

/* Cell_binary_search */
#ifdef DEBUG10
#define debug10(x) x
#else
#define debug10(x)
#endif

/* Testing of linelen algorithm */
#ifdef DEBUG14
#define debug14(x) x
#else
#define debug14(x)
#endif



/* Absolute seek from the start of the file, using fseeko when available */
#ifdef HAVE_FSEEKO
#define moveto(fp,offset) fseeko(fp,offset,SEEK_SET)
#else
#define moveto(fp,offset) fseek(fp,offset,SEEK_SET)
#endif



/* Program Options */
static char *genomesubdir = NULL;
static char *dbroot = NULL;
static char *dbversion = NULL;
static char *user_genomedir = NULL;

static bool sam_headers_p = true;

static bool detect_duplicates_p = false;
static bool mark_duplicates_p = false;
static bool mark_first_p = false;
static bool print_unique_p = true;
static bool print_duplicates_p = true;
static bool restore_original_order_p = false;

/* Must have the enum type rather than bool: a true boolean type would
   collapse every non-zero sort method to the value 1 */
static Secondary_sort_T secondary_sort_method = NO_SECONDARY_SORT;
static bool multiple_primaries_p = false;

static Stopwatch_T stopwatch = NULL;

static char *split_output_root = NULL;
static bool any_circular_p = false;
static bool appendp = false;
static FILE **outputs = NULL;

static int output_nlines = 10000000; /* 10 million */


/* Table of long command-line options.  Each trailing comment names the
   program-option variable(s) the option is meant to control.  The table
   is terminated by an all-zero entry.
   NOTE(review): presumably passed to getopt_long by the option parser,
   which is not visible in this part of the file -- confirm. */
static struct option long_options[] = {
  /* Input options */
  {"dir", required_argument, 0, 'D'}, /* user_genomedir */
  {"db", required_argument, 0, 'd'}, /* dbroot */
  {"split-output", required_argument, 0, 0}, /* outputs */
  {"append-output", no_argument, 0, 0},	     /* appendp */

  {"sort2", required_argument, 0, 0}, /* secondary_sort_method */

  {"mark-dups", no_argument, 0, 0}, /* detect_duplicates_p, mark_duplicates_p, print_unique_p, print_duplicates_p */
  {"mark-first", no_argument, 0, 0}, /* mark_first_p */
  {"dups-only", no_argument, 0, 0}, /* detect_duplicates_p, print_unique_p, print_duplicates_p */
  {"uniq-only", no_argument, 0, 0}, /* detect_duplicates_p, print_unique_p, print_duplicates_p */
  {"restore-orig-order", no_argument, 0, 0}, /* restore_original_order_p */
  {"multiple-primaries", no_argument, 0, 0}, /* multiple_primaries_p */
  {"no-sam-headers", no_argument, 0, 0},     /* sam_headers_p */

  {"output-buffer", required_argument, 0, 0}, /* output_nlines */

  /* Help options */
  {"version", no_argument, 0, '^'}, /* print_program_version */
  {"help", no_argument, 0, '?'}, /* print_program_usage */
  {0, 0, 0, 0}
};


/* Writes the program banner (name, package version, author and contact)
   to stdout, surrounded by blank lines */
static void
print_program_version () {
  FILE *out = stdout;

  fputs("\n",out);
  fputs("sam_sort: sorting and duplication-marking utility for SAM files\n",out);
  fprintf(out,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
  fputs("Thomas D. Wu, Genentech, Inc.\n",out);
  fputs("Contact: twu@gene.com\n",out);
  fputs("\n",out);
}

/* Writes the command-line usage summary to stdout.  The text contains no
   conversion specifications, so it is emitted verbatim. */
static void
print_program_usage () {
  static const char usage_text[] = "\
Usage: sam_sort [OPTIONS...] -d genome <sam file>\n\
\n\
Input options\n\
  -D, --dir=STRING        Genome directory\n\
  -d, --db=STRING         Genome database.  If argument is '?' (with\n\
                            the quotes), this command lists available databases.\n\
Output file options\n\
  --split-output=STRING   Basename for multiple-file output, separately for nomapping,\n\
                            halfmapping_uniq, halfmapping_mult, unpaired_uniq, unpaired_mult,\n\
                            paired_uniq, paired_mult, concordant_uniq, and concordant_mult results\n\
  --append-output         When --split-output is given, this flag will append output\n\
                            to the existing files.  Otherwise, the default is to create new files.\n\
\n\
Other options\n\
  --sort2=STRING          For positions with the same genomic position, sort secondarily by\n\
                             none: no guarantee about the secondary sort (default)\n\
                             orig: original order in the SAM output file (what samtools sort does)\n\
                             accession: alphabetically by accession name\n\
                             mate-fwd: by genomic position of the mate, in ascending order\n\
                             mate-rev: by genomic position of the mate, in descending order\n\
                          For sorting by mate genomic position, a nomapping mate is treated as genomic position 0\n\
  --mark-dups             Mark duplicate reads by altering the flag accordingly\n\
  --mark-first            Also mark the first occurrence of a read that has a subsequent duplicate\n\
\n\
  --dups-only             Print only duplicate reads\n\
  --uniq-only             Print only unique reads\n\
  --restore-orig-order    Restore original order of SAM file.  Useful when --mark-dups, --dups-only, or --uniq-only\n\
                            is specified, and sorting is not desired\n\
  --multiple-primaries    Specify if GSNAP or GMAP was run with the --multiple-primaries flag\n\
  --no-sam-headers        Do not print SAM header lines\n\
  --output-buffer=INT     Size of output buffer kept in RAM for printing (default 10000000, or 10 million SAM lines)\n\
";

  fputs(usage_text,stdout);
}



/* Prints the header text to fp_output.  Does nothing for a NULL stream,
   which can occur because no file is created for OUTPUT_NONE. */
static void
print_file_headers (FILE *fp_output, Filestring_T headers) {

  if (fp_output != NULL) {
    Filestring_print(fp_output,headers);
  }
}


/* Opens every single-end split-output file that is not yet open, so that
   the file exists even if no line is ever written to it.

   Slots 1..N_SPLIT_OUTPUTS_SINGLE_STD are always handled; the following
   slots up to N_SPLIT_OUTPUTS_SINGLE_TOCIRC are handled only when
   any_circular_p is set.  When SAM_HEADERS_ON_EMPTY_FILES is defined and
   sam_headers_p is true, the headers are printed to each newly opened
   file.  The circular slots now honor sam_headers_p in the same way as
   the standard slots and as touch_all_paired_outputs. */
static void
touch_all_single_outputs (FILE **outputs, char *split_output_root, Filestring_T headers, bool paired_end_p, bool appendp) {
  SAM_split_output_type split_output;

  for (split_output = 1; split_output <= N_SPLIT_OUTPUTS_SINGLE_STD; split_output++) {
    if (outputs[split_output] == NULL) {
      outputs[split_output] = SAM_header_fopen(split_output,split_output_root,paired_end_p,appendp);
#ifdef SAM_HEADERS_ON_EMPTY_FILES
      if (sam_headers_p == true) {
        print_file_headers(outputs[split_output],headers);
      }
#endif
    }
  }

  if (any_circular_p == true) {
    /* Continue from the slot following the standard single-end outputs */
    for ( ; split_output <= N_SPLIT_OUTPUTS_SINGLE_TOCIRC; split_output++) {
      if (outputs[split_output] == NULL) {
        outputs[split_output] = SAM_header_fopen(split_output,split_output_root,paired_end_p,appendp);
#ifdef SAM_HEADERS_ON_EMPTY_FILES
        if (sam_headers_p == true) {
          print_file_headers(outputs[split_output],headers);
        }
#endif
      }
    }
  }

  return;
}


/* Opens every paired-end split-output file (slots N_SPLIT_OUTPUTS_SINGLE+1
   through N_SPLIT_OUTPUTS_STD) that is not yet open.  When
   SAM_HEADERS_ON_EMPTY_FILES is defined and sam_headers_p is true, the
   headers are printed to each file opened here. */
static void
touch_all_paired_outputs (FILE **outputs, char *split_output_root, Filestring_T headers, bool appendp) {
  SAM_split_output_type slot;

  for (slot = N_SPLIT_OUTPUTS_SINGLE + 1; slot <= N_SPLIT_OUTPUTS_STD; slot++) {
    if (outputs[slot] != NULL) {
      /* Already open */
      continue;
    }

    outputs[slot] = SAM_header_fopen(slot,split_output_root,/*paired_end_p*/true,appendp);
#ifdef SAM_HEADERS_ON_EMPTY_FILES
    if (sam_headers_p == true) {
      print_file_headers(outputs[slot],headers);
    }
#endif
  }
}


/* Returns true if any output slot after N_SPLIT_OUTPUTS_SINGLE, up to and
   including N_SPLIT_OUTPUTS, has an open file; false otherwise */
static bool
paired_outputs_p (FILE **outputs) {
  SAM_split_output_type slot;

  for (slot = N_SPLIT_OUTPUTS_SINGLE + 1; slot <= N_SPLIT_OUTPUTS; slot++) {
    if (outputs[slot] != NULL) {
      return true;
    }
  }

  return false;
}


/* Makes sure all expected split-output files exist.  The single-end
   outputs are always touched; the paired-end outputs are touched as well
   when at least one paired-end output file is already open. */
static void
touch_all_files (FILE **outputs, char *split_output_root, Filestring_T headers, bool appendp) {
  bool have_paired_p = paired_outputs_p(outputs);

  touch_all_single_outputs(outputs,split_output_root,headers,/*paired_end_p*/have_paired_p,appendp);
  if (have_paired_p == true) {
    touch_all_paired_outputs(outputs,split_output_root,headers,appendp);
  }
}


/************************************************************************/


#ifdef LOOKUP_SEQS
static char complCode[128] = COMPLEMENT_LC;

/* Reverse-complements the first length characters of sequence in place:
   characters are swapped pairwise from the two ends, each translated
   through complCode, and the middle character of an odd-length sequence
   is translated where it stands. */
static void
make_complement_inplace (char *sequence, unsigned int length) {
  unsigned int left = 0, right = length - 1;
  unsigned int npairs = length/2;
  char saved;

  while (left < npairs) {
    saved = complCode[(int) sequence[left]];
    sequence[left] = complCode[(int) sequence[right]];
    sequence[right] = saved;
    left++;
    right--;
  }

  /* The indices meet only when length is odd */
  if (left == right) {
    sequence[left] = complCode[(int) sequence[left]];
  }
}
#endif



/* A Cell holds the per-SAM-line information used by the comparison
   functions in this file.  T is a pointer to the structure; the
   comparators receive pointers to T. */
#define T Cell_T
typedef struct T *T;
struct T {
  char *acc;			/* Accession.  Needed for ACC_SECONDARY_SORT */

  int lineindex;		/* Original line order */
  int readindex;		/* inputi or outputi.  Grouped by accession.  Needed for marking duplicates to find the other queryseq */

  Univcoord_T highpos;          /* genomicpos - initial_softclip + readlength.  Add readlength to avoid negative coordinates.
				   Set to 0 by Cell_fill_no_dedup */
  Univcoord_T genomicpos;	/* The comparators place a genomicpos of 0 after all nonzero positions */
  Univcoord_T mate_genomicpos;	/* Needed for some secondary sorts */

  /* As assigned by Cell_fill_hashseqs: when both hash values are nonzero,
     hashseq_min is the smaller and hashseq_max the larger.  When either
     is zero, hashseq_min receives the other value and hashseq_max is 0. */
  Hashseq_T hashseq_min;	/* Lower numeric value of hashseq5 and hashseq3 */
  Hashseq_T hashseq_max;	/* Higher numeric value of hashseq5 and hashseq3 */

#ifdef VERIFY_HASHSEQ
  char *queryseq_alpha;	/* Earlier alphabetic order of queryseq5 and queryseq3 */
  char *queryseq_omega;	/* Later alphabetic order of queryseq5 and queryseq3 */

  bool fillp;		/* Indicates that queryseq_alpha and queryseq_omega are filled */
  bool firstp;		/* Indicates a single line where queryseqs were assigned for the read  */
#endif
};


/* Stores the two hash values of a read in this cell in canonical order.
   With two nonzero values, hashseq_min gets the smaller and hashseq_max
   the larger.  If either value is zero, hashseq_min gets the other value
   (zero when both are zero) and hashseq_max is set to 0. */
static void
Cell_fill_hashseqs (T this, Hashseq_T hashseq5, Hashseq_T hashseq3) {
  Hashseq_T lower, upper;

  if (hashseq5 == 0 || hashseq3 == 0) {
    /* At most one value is present; it always goes in the min slot */
    lower = (hashseq5 != 0) ? hashseq5 : hashseq3;
    upper = 0;
  } else if (hashseq5 < hashseq3) {
    lower = hashseq5;
    upper = hashseq3;
  } else {
    lower = hashseq3;
    upper = hashseq5;
  }

  this->hashseq_min = lower;
  this->hashseq_max = upper;
}


#ifdef VERIFY_HASHSEQ
/* Stores the two query sequences of a read in this cell in alphabetic
   order (queryseq_alpha before queryseq_omega).  If either sequence is
   NULL, queryseq_alpha gets the other one (NULL when both are NULL) and
   queryseq_omega is NULL.  The pointers are stored, not copied.  Also
   sets fillp to true and records firstp. */
static void
Cell_fill_queryseqs (T this, char *queryseq5, char *queryseq3, bool firstp) {
  char *earlier, *later;

  if (queryseq5 == NULL || queryseq3 == NULL) {
    /* At most one sequence is present; it always goes in the alpha slot */
    earlier = (queryseq5 != NULL) ? queryseq5 : queryseq3;
    later = (char *) NULL;
  } else if (strcmp(queryseq5,queryseq3) < 0) {
    earlier = queryseq5;
    later = queryseq3;
  } else {
    earlier = queryseq3;
    later = queryseq5;
  }

  this->queryseq_alpha = earlier;
  this->queryseq_omega = later;

  this->firstp = firstp;
  this->fillp = true;
}
#endif
    

/* Initializes a cell for a run that detects duplicates.  Besides the
   identifying fields, this computes highpos from the genomic position,
   the initial soft clip and the read length.  The hashseq fields are not
   set here.  The acc pointer is stored, not copied. */
static void
Cell_fill_for_dedup (struct T *this, int lineindex, int readindex, char *acc,
                     int initial_softclip, int readlength,
                     Univcoord_T genomicpos, Univcoord_T mate_genomicpos) {

  /* Positions */
  this->genomicpos = genomicpos;
  this->mate_genomicpos = mate_genomicpos;
  this->highpos = genomicpos - initial_softclip + readlength;

  /* Identity */
  this->acc = acc;
  this->lineindex = lineindex;
  this->readindex = readindex;

#ifdef VERIFY_HASHSEQ
  /* Query sequences are assigned later by Cell_fill_queryseqs */
  this->queryseq_alpha = (char *) NULL;
  this->queryseq_omega = (char *) NULL;

  this->fillp = false;
  this->firstp = false;
#endif
}

/* Initializes a cell for a run that only sorts.  No soft clip or read
   length is needed: readindex and highpos are both set to 0.  The acc
   pointer is stored, not copied. */
static void
Cell_fill_no_dedup (struct T *this, int lineindex, char *acc,
                    Univcoord_T genomicpos, Univcoord_T mate_genomicpos) {

  /* Positions */
  this->genomicpos = genomicpos;
  this->mate_genomicpos = mate_genomicpos;
  this->highpos = 0;		/* Will sort on genomicpos only */

  /* Identity */
  this->acc = acc;
  this->lineindex = lineindex;
  this->readindex = 0;
}


#if 0
/* Disabled code (inside #if 0).  Copies one line from fp_input to the
   appropriate output stream in pieces of at most CHUNK characters,
   opening the output and printing headers to it the first time it is
   used.
   NOTE(review): this refers to cell members (linelen, linestart,
   split_output, flag) that struct T as defined in this file does not
   have, so it would not compile if re-enabled as is. */
static void
Cell_print_fromfile (FILE *fp_input, T this, Filestring_T headers) {
  char buffer[CHUNK];
  int linelength = this->linelen;
  bool paired_end_p;
  FILE *fp_output;

#if 0
  if (nofailsp == true && this->split_output == OUTPUT_NM) {
    /* Skip */
    return;

  } else if (failsonlyp == true && this->split_output != OUTPUT_NM &&
	     this->split_output != OUTPUT_HX && this->split_output != OUTPUT_UX &&
	     this->split_output != OUTPUT_PX && this->split_output != OUTPUT_CX) {
    return;
  }

  if (failedinput_root != NULL && primaryp(this->flag) == true) {
    /* Convert SAM line to FASTA or FASTQ and write to a failedinput file */
  }
#endif

  /* Select the output stream, opening it on first use */
  if (split_output_root == NULL) {
    if ((fp_output = outputs[0]) == NULL) {
      fp_output = outputs[0] = stdout;
      Filestring_print(fp_output,headers);
    }

  } else if (this->split_output == OUTPUT_NONE) {
    /* Skip */
    return;

  } else {
    if ((fp_output = outputs[this->split_output]) == NULL) {
      paired_end_p = this->flag & PAIRED_READ;
      fp_output = outputs[this->split_output] = SAM_header_fopen(this->split_output,split_output_root,paired_end_p,appendp);
      Filestring_print(fp_output,headers);
    }
  }

  moveto(fp_input,this->linestart);

#ifdef DEBUG
  printf("readindex %d: ",this->readindex);
#endif

  /* Copy full chunks first, then the remainder */
  while (linelength > CHUNK) {
    fread(buffer,sizeof(char),CHUNK,fp_input);
    fwrite(buffer,sizeof(char),CHUNK,fp_output);
    linelength -= CHUNK;
  }
  if (linelength > 0) {
    fread(buffer,sizeof(char),linelength,fp_input);
    fwrite(buffer,sizeof(char),linelength,fp_output);
  }

  return;
}
#endif


/* qsort comparator used when restore_original_order_p is true: orders
   cells by ascending lineindex.  Returns -1, 0 or +1. */
static int
Cell_lineindex_cmp (const void *a, const void *b) {
  int first = (* (T *) a)->lineindex;
  int second = (* (T *) b)->lineindex;

  return (first > second) - (first < second);
}

/* For NO_SECONDARY_SORT */
static int
Cell_genomicpos_cmp (const void *a, const void *b) {
  T x = * (T *) a;
  T y = * (T *) b;

  if (x->genomicpos != 0 && y->genomicpos == 0) {
    return -1;
  } else if (y->genomicpos != 0 && x->genomicpos == 0) {
    return +1;
  } else if (x->genomicpos < y->genomicpos) {
    return -1;
  } else if (y->genomicpos < x->genomicpos) {
    return +1;
  } else {
    return 0;
  }
}

/* For ORIG_SECONDARY_SORT */
static int
Cell_genomicpos_lineindex_cmp (const void *a, const void *b) {
  T x = * (T *) a;
  T y = * (T *) b;

  if (x->genomicpos != 0 && y->genomicpos == 0) {
    return -1;
  } else if (y->genomicpos != 0 && x->genomicpos == 0) {
    return +1;
  } else if (x->genomicpos < y->genomicpos) {
    return -1;
  } else if (y->genomicpos < x->genomicpos) {
    return +1;
  } else if (x->lineindex < y->lineindex) {
    return -1;
  } else if (y->lineindex < x->lineindex) {
    return +1;
  } else {
    return 0;
  }
}

/* For ACC_SECONDARY_SORT */
static int
Cell_genomicpos_acc_cmp (const void *a, const void *b) {
  T x = * (T *) a;
  T y = * (T *) b;

  if (x->genomicpos != 0 && y->genomicpos == 0) {
    return -1;
  } else if (y->genomicpos != 0 && x->genomicpos == 0) {
    return +1;
  } else if (x->genomicpos < y->genomicpos) {
    return -1;
  } else if (y->genomicpos < x->genomicpos) {
    return +1;
  } else {
    return strcmp(x->acc,y->acc);
  }
}

/* For MATEFWD_SECONDARY_SORT */
static int
Cell_genomicpos_matefwd_cmp (const void *a, const void *b) {
  T x = * (T *) a;
  T y = * (T *) b;

  if (x->genomicpos != 0 && y->genomicpos == 0) {
    return -1;
  } else if (y->genomicpos != 0 && x->genomicpos == 0) {
    return +1;
  } else if (x->genomicpos < y->genomicpos) {
    return -1;
  } else if (y->genomicpos < x->genomicpos) {
    return +1;
  } else if (x->mate_genomicpos < y->mate_genomicpos) {
    return -1;
  } else if (y->mate_genomicpos < x->mate_genomicpos) {
    return +1;
  } else {
    return 0;
  }
}

/* For MATEREV_SECONDARY_SORT */
static int
Cell_genomicpos_materev_cmp (const void *a, const void *b) {
  T x = * (T *) a;
  T y = * (T *) b;

  if (x->genomicpos != 0 && y->genomicpos == 0) {
    return -1;
  } else if (y->genomicpos != 0 && x->genomicpos == 0) {
    return +1;
  } else if (x->genomicpos < y->genomicpos) {
    return -1;
  } else if (y->genomicpos < x->genomicpos) {
    return +1;
  } else if (x->mate_genomicpos > y->mate_genomicpos) {
    return -1;
  } else if (y->mate_genomicpos > x->mate_genomicpos) {
    return +1;
  } else {
    return 0;
  }
}


/* Used for no-mappers, and as secondary sort for Cell_highpos_hashseq_cmp.

   qsort comparator on the pair (hashseq_min, hashseq_max).  For each
   field in turn, a value of 0 sorts after every nonzero value, and
   nonzero values sort in ascending order.  Two cells whose hashseq_min
   are both 0 compare as equal without looking at hashseq_max.

   The "both hashseq_max are 0" test previously repeated the preceding
   condition and could never be taken; it now tests what was intended.
   The results are unchanged, since that case already reached the final
   return of 0. */
static int
Cell_hashseq_cmp (const void *a, const void *b) {
  T x = * (T *) a;
  T y = * (T *) b;

  if (x->hashseq_min != 0 && y->hashseq_min == 0) {
    return -1;
  } else if (x->hashseq_min == 0 && y->hashseq_min != 0) {
    return +1;
  } else if (x->hashseq_min == 0 && y->hashseq_min == 0) {
    return 0;
  } else if (x->hashseq_min < y->hashseq_min) {
    return -1;
  } else if (y->hashseq_min < x->hashseq_min) {
    return +1;
  } else if (x->hashseq_max != 0 && y->hashseq_max == 0) {
    return -1;
  } else if (x->hashseq_max == 0 && y->hashseq_max != 0) {
    return +1;
  } else if (x->hashseq_max == 0 && y->hashseq_max == 0) {
    return 0;
  } else if (x->hashseq_max < y->hashseq_max) {
    return -1;
  } else if (y->hashseq_max < x->hashseq_max) {
    return +1;
  } else {
    return 0;
  }
}

static int
Cell_highpos_hashseq_cmp (const void *a, const void *b) {
  T x = * (T *) a;
  T y = * (T *) b;

  /* Duplicates indicated by highpos of 0.  Put at end. */
  if (x->highpos != 0 && y->highpos == 0) {
    return -1;
  } else if (y->highpos != 0 && x->highpos == 0) {
    return +1;
  } else if (x->highpos < y->highpos) {
    return -1;
  } else if (y->highpos < x->highpos) {
    return +1;

  } else {
    return Cell_hashseq_cmp(a,b);
  }
}


#ifdef VERIFY_HASHSEQ
static int
Cell_queryseq_cmp (const void *a, const void *b) {
  T x = * (T *) a;
  T y = * (T *) b;
  int cmp;

  if (x->queryseq_alpha != NULL && y->queryseq_alpha == NULL) {
    return -1;
  } else if (x->queryseq_alpha == NULL && y->queryseq_alpha != NULL) {
    return +1;
  } else if (x->queryseq_alpha == NULL && y->queryseq_alpha == NULL) {
    return 0;
  } else if ((cmp = strcmp(x->queryseq_alpha,y->queryseq_alpha)) != 0) {
    return cmp;
  } else if (x->queryseq_omega != NULL && y->queryseq_omega == NULL) {
    return -1;
  } else if (x->queryseq_omega == NULL && y->queryseq_omega != NULL) {
    return +1;
  } else if (x->queryseq_omega == NULL && y->queryseq_omega == NULL) {
    return 0;
  } else {
    return strcmp(x->queryseq_omega,y->queryseq_omega);
  }
}
#endif


#ifdef VERIFY_HASHSEQ
static int
Cell_highpos_queryseq_cmp (const void *a, const void *b) {
  T x = * (T *) a;
  T y = * (T *) b;

  /* Duplicates indicated by highpos of 0.  Put at end. */
  if (x->highpos != 0 && y->highpos == 0) {
    return -1;
  } else if (y->highpos != 0 && x->highpos == 0) {
    return +1;
  } else if (x->highpos < y->highpos) {
    return -1;
  } else if (y->highpos < x->highpos) {
    return +1;

  } else {
    return Cell_queryseq_cmp(a,b);
  }
}
#endif


#if 0
/* Disabled code (inside #if 0).  Binary search over cells[lowi..highi)
   for a cell whose genomicpos equals goal.  On a match, steps backward
   (not below the current lowi) to the first adjacent cell with that
   genomicpos and returns its index; otherwise returns the final value
   of highi.
   NOTE(review): presumably requires cells to be sorted by ascending
   genomicpos -- confirm before re-enabling. */
static int
Cell_binary_search (int lowi, int highi, T *cells, Univcoord_T goal) {
  int middlei;

  debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,goal));

  while (lowi < highi) {
    /* Written this way rather than (lowi + highi)/2 */
    middlei = lowi + ((highi - lowi) / 2);
    debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
		   lowi,cells[lowi]->genomicpos,middlei,cells[middlei]->genomicpos,
		   highi,cells[highi]->genomicpos,goal));
    if (goal < cells[middlei]->genomicpos) {
      highi = middlei;
    } else if (goal > cells[middlei]->genomicpos) {
      lowi = middlei + 1;
    } else {
      debug10(printf("binary search returns %d\n",middlei));
      /* Rewind to first cell having the goal */
      while (middlei - 1 >= lowi && cells[middlei - 1]->genomicpos == goal) {
	middlei--;
      }
      return middlei;
    }
  }

  debug10(printf("binary search returns %d\n",highi));
  return highi;
}
#endif

#if 0
/* Disabled code (inside #if 0).  Finds the cell in cells[lowi..highi)
   that has the given genomicpos (goal) and readindex: starts at the
   index returned by Cell_binary_search and scans forward while the
   genomicpos still equals goal.  Returns the index of the matching cell,
   or -1 after printing a message to stderr if there is none. */
static int
Cell_find (int lowi, int highi, T *cells, Univcoord_T goal, int readindex) {
  int i;

  i = Cell_binary_search(lowi,highi,cells,goal);
  while (i < highi && cells[i]->genomicpos == goal) {
    if (cells[i]->readindex == readindex) {
      return i;
    } else {
      i++;
    }
  }

  fprintf(stderr,"Cannot find cell in %d..%d with genomicpos %u and readindex %d\n",
	  lowi,highi,goal,readindex);
  return -1;
}
#endif


/* Helper for sort_cells: prints a progress message to stderr, sorts the
   array of cell pointers with the given comparator, and reports the
   elapsed time measured with the file-level stopwatch. */
static void
sort_cells_timed (T *cells, int ncells_total, const char *message,
                  int (*compare) (const void *, const void *)) {

  fputs(message,stderr);
  Stopwatch_start(stopwatch);
  qsort(cells,ncells_total,sizeof(T),compare);
  fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));

  return;
}


/* Sorts the array of cell pointers in place for output.

   If restore_original_order_p is true, cells are re-sorted by lineindex,
   but only when detect_dups_p is also true; otherwise a warning is
   printed and the array is left as given.  If restore_original_order_p
   is false, cells are sorted by genomicpos with the comparator that
   corresponds to secondary_sort_method.  Aborts on an unrecognized
   method.

   The warning now ends with a newline so that later messages on stderr
   do not run onto the same line. */
static void
sort_cells (T *cells, int ncells_total, bool restore_original_order_p,
            Secondary_sort_T secondary_sort_method, bool detect_dups_p) {

  if (restore_original_order_p == true) {
    if (detect_dups_p == false) {
      fprintf(stderr,"Warning: --restore-orig-order meaningless without --mark-dups, --dups-only, or --uniq-only.\n");
    } else {
      sort_cells_timed(cells,ncells_total,"Re-sorting entries by original order...",
                       Cell_lineindex_cmp);
    }
    return;
  }

  switch (secondary_sort_method) {
  case NO_SECONDARY_SORT:
    sort_cells_timed(cells,ncells_total,"Sorting entries by genomicpos...",
                     Cell_genomicpos_cmp);
    break;

  case ORIG_SECONDARY_SORT:
    sort_cells_timed(cells,ncells_total,"Sorting entries by genomicpos and by original file position...",
                     Cell_genomicpos_lineindex_cmp);
    break;

  case ACC_SECONDARY_SORT:
    sort_cells_timed(cells,ncells_total,"Sorting entries by genomicpos and by accession...",
                     Cell_genomicpos_acc_cmp);
    break;

  case MATEFWD_SECONDARY_SORT:
    sort_cells_timed(cells,ncells_total,"Sorting entries by genomicpos and by mate fwd...",
                     Cell_genomicpos_matefwd_cmp);
    break;

  case MATEREV_SECONDARY_SORT:
    sort_cells_timed(cells,ncells_total,"Sorting entries by genomicpos and by mate rev...",
                     Cell_genomicpos_materev_cmp);
    break;

  default:
    fprintf(stderr,"Secondary sort method not recognized\n");
    abort();
  }

  return;
}


/* Size of the staging buffer in print_line_list.  Should be a multiple of
   page size */
#define OUTPUTLEN 16384

/* Writes the lines held in *output_list to fp_output and releases them.
   The list is reversed first, then its strings are packed back to back
   into a buffer of OUTPUTLEN characters, which is written out each time
   it fills and once more at the end for any remainder.  Each line is
   freed after it has been copied, and the list itself is freed before
   returning. */
static void
print_line_list (FILE *fp_output, List_T *output_list) {
  char buffer[OUTPUTLEN];
  char *line, *src;
  int nbuffered = 0, remaining, room;
  List_T node;

  *output_list = List_reverse(*output_list);

  for (node = *output_list; node != NULL; node = List_next(node)) {
    line = (char *) List_head(node);
    src = line;
    remaining = strlen(line);

    /* While the rest of this line fills the buffer, top it up and flush */
    while (nbuffered + remaining >= OUTPUTLEN) {
      room = OUTPUTLEN - nbuffered;
      memcpy(&(buffer[nbuffered]),src,room);
      fwrite(buffer,sizeof(char),OUTPUTLEN,fp_output);
      nbuffered = 0;

      src += room;
      remaining -= room;
    }

    /* What is left of the line fits with room to spare */
    memcpy(&(buffer[nbuffered]),src,remaining);
    nbuffered += remaining;

    FREE(line);
  }

  if (nbuffered > 0) {
    fwrite(buffer,sizeof(char),nbuffered,fp_output);
  }

  List_free(output_list);
}


/* Helper for print_results: seeks to fileposition in fp_sam and reads one
   line of linelen characters (which includes its newline) into a newly
   allocated, NUL-terminated buffer owned by the caller.  If the read
   fails, a warning is printed and the buffer is zero-filled, so that
   later string operations see an empty line rather than uninitialized
   memory. */
static char *
print_results_read_line (FILE *fp_sam, size_t fileposition, int linelen) {
  char *line;

  moveto(fp_sam,fileposition);
  line = (char *) MALLOC((linelen+1)*sizeof(char));
  if (fgets(line,linelen+1,fp_sam) == NULL) {
    fprintf(stderr,"Warning: could not read a SAM line of length %d at file position %llu\n",
            linelen,(unsigned long long) fileposition);
    memset(line,0,(linelen+1)*sizeof(char));
  }

  return line;
}


/* Prints the SAM lines in the order given by cells, reading the text of
   each line back from the input files.

   Output is produced in blocks of at most output_nlines lines.  For each
   block, all input files are walked line by line (advancing the file
   position by the stored line lengths), and only the lines whose output
   rank falls inside the block are read.  Lines go to stdout when
   split_output_root is NULL, and otherwise to the file for their split
   output type, which is opened (with headers) on first use.

     cells, ncells_total    cells in output order
     sam_inputs, ninputs    input streams
     headerlengths          file offset of the first SAM line in each input
     ncells                 number of SAM lines in each input
     linelengths            length of each SAM line, in input order
     split_outputs, flags   per-line values, indexed by lineindex
     duplicatep             per-read duplicate marks, indexed through
                            readindices[lineindex]; may be NULL
     paired_end_p           passed to SAM_header_fopen when opening outputs

   Duplicate lines are printed only if print_duplicates_p is set (with
   the flag rewritten by Samread_flag_duplicate when mark_duplicates_p is
   set), and the other lines only if print_unique_p is set.

   Returns the number of lines belonging to duplicate reads, counted
   whether or not they were printed, or 0 when duplicatep is NULL. */
static int
print_results (T *cells, int ncells_total, FILE **sam_inputs, int *headerlengths, int *ncells, int ninputs,
               Intlist_T linelengths, SAM_split_output_type *split_outputs, unsigned int *flags,
               Filestring_T headers, bool *duplicatep, int *readindices, bool paired_end_p) {
  int n_duplicate_lines = 0;
  int *ranks, rank;
  int lineindex, i;
  Intlist_T l;
  int linelen;

  int filei;
  FILE *fp_sam, *fp_output;
  size_t fileposition;

  int nblocks, blocki;
  int *blockstarts, *blockends, blockstart, blockend, nlines, linei;
  char **output_lines, *line;
  SAM_split_output_type *output_splits, split_output;

  List_T *output_lists;


  output_lists = (List_T *) CALLOC((1+N_SPLIT_OUTPUTS),sizeof(List_T));

  /* Example:
     If order (cells[i]->lineindex) is 3 1 4 5 9 2 6 8 7, then ranks is 2 6 1 3 4 7 9 8 5.
     Ranks indicate the output position for each input lineindex from 1 to ncells_total */

  /* output_nlines is an int, so it is printed with %d */
  fprintf(stderr,"Printing entries in blocks of %d lines...\n",output_nlines);
  Stopwatch_start(stopwatch);

  ranks = (int *) MALLOC(ncells_total*sizeof(int));
  for (i = 0; i < ncells_total; i++) {
    ranks[cells[i]->lineindex] = i;
  }

  /* Divide the output ranks into blocks; the last block may be short */
  nblocks = (ncells_total + output_nlines - 1)/output_nlines;
  blockstarts = (int *) MALLOC(nblocks*sizeof(int));
  blockends = (int *) MALLOC(nblocks*sizeof(int));
  for (lineindex = 0, blocki = 0; lineindex + output_nlines < ncells_total;
       lineindex += output_nlines, blocki++) {
    blockstarts[blocki] = lineindex;
    blockends[blocki] = lineindex + output_nlines;
  }
  if (lineindex < ncells_total) {
    blockstarts[blocki] = lineindex;
    blockends[blocki] = ncells_total;
  }

  for (blocki = 0; blocki < nblocks; blocki++) {
    fprintf(stderr,"  Block %d out of %d...",blocki+1,nblocks);
    /* For each block, we process each input file */
    blockstart = blockstarts[blocki];
    blockend = blockends[blocki];
    nlines = blockend - blockstart;
    /* Zeroed, so a NULL entry means that no line is to be printed there */
    output_lines = (char **) CALLOC(nlines,sizeof(char *));
    output_splits = (SAM_split_output_type *) MALLOC(nlines*sizeof(SAM_split_output_type));

    l = linelengths;
    lineindex = 0;
    for (filei = 0; filei < ninputs; filei++) {
      fp_sam = sam_inputs[filei];
      fileposition = headerlengths[filei];

      for (linei = 0; linei < ncells[filei]; linei++) {
        linelen = Intlist_head(l);
        /* printf("Cell %d has rank %d\n",lineindex,ranks[lineindex]); */
        if ((rank = ranks[lineindex]) >= blockstart && rank < blockend) {
          if (duplicatep != NULL && duplicatep[readindices[lineindex]] == true) {
            if (print_duplicates_p == false) {
              output_splits[rank - blockstart] = OUTPUT_NONE;
            } else {
              line = print_results_read_line(fp_sam,fileposition,linelen);
              if (mark_duplicates_p == false) {
                output_lines[rank - blockstart] = line;
              } else {
                /* The flagged copy replaces the line as read */
                output_lines[rank - blockstart] = Samread_flag_duplicate(line,linelen,flags[lineindex]);
                FREE(line);
              }
              output_splits[rank - blockstart] = split_outputs[lineindex];
            }
            n_duplicate_lines++;

          } else {
            if (print_unique_p == false) {
              output_splits[rank - blockstart] = OUTPUT_NONE;
            } else {
              line = print_results_read_line(fp_sam,fileposition,linelen);
              output_lines[rank - blockstart] = line;
              output_splits[rank - blockstart] = split_outputs[lineindex];
            }
          }
        }

        /* Advance to the next line whether or not this one was read */
        fileposition += linelen;
        l = Intlist_next(l);
        lineindex++;
      }
    }


    /* Push lines onto lists */
    if (split_output_root == NULL) {
      for (linei = 0; linei < nlines; linei++) {
        if ((line = output_lines[linei]) != NULL) {
          output_lists[0] = List_push(output_lists[0],(void *) line);
        }
      }

    } else {
      for (linei = 0; linei < nlines; linei++) {
        if ((line = output_lines[linei]) == NULL) {
          /* Skip */
        } else if ((split_output = output_splits[linei]) == OUTPUT_NONE) {
          /* Skip */
          FREE(line);
        } else {
          output_lists[split_output] = List_push(output_lists[split_output],(void *) line);
        }
      }
    }


    /* Concatenate lists and write */
    if (split_output_root == NULL) {
      if ((fp_output = outputs[0]) == NULL) {
        fp_output = outputs[0] = stdout;
        Filestring_print(fp_output,headers);
      }
      print_line_list(fp_output,&(output_lists[0]));

    } else {
      for (split_output = 0; split_output <= N_SPLIT_OUTPUTS; split_output++) {
        if (output_lists[split_output] != NULL) {
          if ((fp_output = outputs[split_output]) == NULL) {
            fp_output = outputs[split_output] = SAM_header_fopen(split_output,split_output_root,paired_end_p,appendp);
            Filestring_print(fp_output,headers);
          }
          print_line_list(fp_output,&(output_lists[split_output]));
        }
      }
    }

    FREE(output_splits);
    FREE(output_lines);
    fprintf(stderr,"done\n");
  }

  FREE(blockends);
  FREE(blockstarts);
  FREE(ranks);
  FREE(output_lists);

  fprintf(stderr,"Done with printing (%.1f seconds)\n",Stopwatch_stop(stopwatch));
  if (duplicatep == NULL) {
    return 0;
  } else {
    return n_duplicate_lines;
  }
}



/* Sorts the SAM lines of all inputs without duplicate detection and
   prints them.

   First pass: every SAM line is read (using the precomputed
   linelengths list, which runs across all files in order) and parsed
   with Samread_parse_line.  For each line we record its split-output
   class and flag, and fill one cell with the line index and genomic
   positions.  The accession is retained in the cell only for
   ACC_SECONDARY_SORT; otherwise it is freed immediately.

   The cells are then sorted by sort_cells() and handed to
   print_results(), which re-reads the lines from the files.

   sam_inputs, headerlengths, ncells: per-input arrays of length ninputs
   linelengths: lengths of all SAM lines, in file order
   ncells_total: total number of SAM lines over all inputs
   headers: passed through to print_results */
static void
process_no_dedup (FILE **sam_inputs, int *headerlengths, int *ncells, int ninputs,
		  Intlist_T linelengths, int ncells_total, Univ_IIT_T chromosome_iit,
		  Univcoord_T *chroffsets, Filestring_T headers) {
  T *cells;
  FILE *fp_sam;
  int filei, linei;
  bool paired_end_p = false;
  Intlist_T l;
  struct T *cells_allocated, *ptr;
  int i, k;

  size_t fileposition;
  char *line, *acc, *hiti;
  int linelen, acclength;
  int lineindex;
  unsigned int *flags, flag;
  int initial_softclip, readlength;
  Hashseq_T hashseq;
  SAM_split_output_type *split_outputs, split_output;
  Univcoord_T genomicpos, mate_genomicpos;
  bool need_mate_genomicpos_p;

  /* Both mate-based secondary sorts need the mate position.  (The
     second test previously repeated MATEFWD, so mate-rev never
     requested it.) */
  if (secondary_sort_method == MATEFWD_SECONDARY_SORT) {
    need_mate_genomicpos_p = true;
  } else if (secondary_sort_method == MATEREV_SECONDARY_SORT) {
    need_mate_genomicpos_p = true;
  } else {
    need_mate_genomicpos_p = false;
  }

  /* One contiguous allocation of cells, plus an array of pointers that gets sorted */
  ptr = cells_allocated = (struct T *) MALLOC(ncells_total * sizeof(struct T));
  cells = (T *) MALLOC(ncells_total * sizeof(T));
  for (i = 0; i < ncells_total; i++) {
    cells[i] = &(ptr[i]);
  }

  split_outputs = (SAM_split_output_type *) MALLOC(ncells_total * sizeof(SAM_split_output_type));
  flags = (unsigned int *) MALLOC(ncells_total * sizeof(unsigned int));

  lineindex = -1;		/* Incremented before use, so the first line gets index 0 */
  fprintf(stderr,"Reading SAM files...\n");

  k = 0;
  l = linelengths;
  for (filei = 0; filei < ninputs; filei++) {
    fprintf(stderr,"  Reading file %d...",filei+1);
    fp_sam = sam_inputs[filei];
    fileposition = headerlengths[filei];
    moveto(fp_sam,fileposition);

    for (linei = 0; linei < ncells[filei]; linei++) {
      linelen = Intlist_head(l);
      line = (char *) MALLOC((linelen+1)*sizeof(char));
      /* Will read linelen characters with '\n' and then '\0' */
      if (fgets(line,linelen+1,fp_sam) == NULL) {
	/* Do not parse an unfilled buffer */
	fprintf(stderr,"\nUnable to read SAM line %d of file %d\n",linei+1,filei+1);
	exit(9);
      }
      acc = Samread_parse_line(&acclength,&flag,&split_output,&hiti,
			       &genomicpos,&mate_genomicpos,&initial_softclip,&readlength,
			       &hashseq,line,linelen,chromosome_iit,chroffsets,need_mate_genomicpos_p);
      if (flag & PAIRED_READ) {
	paired_end_p = true;
      }
      FREE(hiti);
      FREE(line);

      lineindex++;
      split_outputs[lineindex] = split_output;
      flags[lineindex] = flag;

      if (secondary_sort_method == ACC_SECONDARY_SORT) {
	/* Cell keeps acc for the sort; freed below after sort_cells */
	Cell_fill_no_dedup(cells[k++],lineindex,acc,genomicpos,mate_genomicpos);
      } else {
	FREE(acc);
	Cell_fill_no_dedup(cells[k++],lineindex,/*acc*/(char *) NULL,
			   genomicpos,mate_genomicpos);
      }

      fileposition += linelen;
      l = Intlist_next(l);
    }

    fprintf(stderr,"done\n");
  }

  /* Sort entries, based on genomicpos and secondary criterion */
  sort_cells(cells,ncells_total,restore_original_order_p,
	     secondary_sort_method,/*detect_dups_p*/false);

  /* Accessions were needed only for the sort */
  if (secondary_sort_method == ACC_SECONDARY_SORT) {
    for (i = 0; i < ncells_total; i++) {
      FREE(cells[i]->acc);
    }
  }

  /* Print */
  print_results(cells,ncells_total,sam_inputs,headerlengths,ncells,ninputs,
		linelengths,split_outputs,flags,headers,
		/*duplicatep*/(bool *) NULL,/*readindices*/(int *) NULL,
		paired_end_p);

  FREE(flags);
  FREE(split_outputs);

  FREE(cells);
  FREE(cells_allocated);

  return;
}


#ifdef LOOKUP_SEQS
/* Makes sure that both this->queryseq5 and this->queryseq3 are filled,
   reading any missing sequence from the SAM file sam_inputs[this->filei].

   If this cell's flag has FIRST_READ_P set, its own line supplies
   queryseq5 and the mate cell (found through queryseq3_index, indexed
   by this->readindex, as an offset into cells_allocated) supplies
   queryseq3.  Otherwise the roles are swapped and the mate is found
   through queryseq5_index.  A sequence whose source line has
   QUERY_MINUSP set is complemented in place with
   make_complement_inplace.  Sequences that are already non-NULL are
   left alone.  Finishes by calling Cell_standardize_queryseqs(this).

   Note that the mate is read from the same FILE as this cell.

   NOTE(review): the debug9 statements refer to k and headers, which
   are not declared in this function; presumably they compile only
   while debug9 discards its argument -- confirm before enabling. */
static void
Cell_access_queryseqs (T this, FILE **sam_inputs, int *queryseq5_index, int *queryseq3_index,
		       struct T *cells_allocated) {
  FILE *fp_sam;
  unsigned int flag;
  char *read;
  int readlength;
  int mate_allocated;
  T mate;

  fp_sam = sam_inputs[this->filei];
  debug9(printf("Looking for queryseqs for "));
  debug9(Cell_print_fromfile(fp_sam,this,headers));

  if (this->flag & FIRST_READ_P) {
    debug9(printf("Flag for entry %d is %u, indicating a first read\n",k,this->flag));
    /* Own line provides queryseq5 */
    if (this->queryseq5 == NULL) {
      moveto(fp_sam,this->linestart);
      Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,this->linelen);
      if (this->flag & QUERY_MINUSP) {
	debug9(printf("complementing queryseq5\n"));
	make_complement_inplace(read,readlength);
      }
      this->queryseq5 = read;
      debug9(printf("queryseq5 is %s\n",read));
    }
	    
    /* Mate's line provides queryseq3 */
    if (this->queryseq3 == NULL) {
      mate_allocated = queryseq3_index[this->readindex];
      mate = &(cells_allocated[mate_allocated]);
      debug9(printf("Mate is "));
      debug9(Cell_print_fromfile(fp_sam,mate,headers));
      moveto(fp_sam,mate->linestart);
      Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
      /* Strand is taken from the mate's flag, since the mate's line was read */
      if (mate->flag & QUERY_MINUSP) {
	debug9(printf("complementing queryseq3\n"));
	make_complement_inplace(read,readlength);
      }
      this->queryseq3 = read;
      debug9(printf("queryseq3 is %s\n",read));
    }

  } else {
    debug9(printf("Flag for entry %d is %u, indicating a second read\n",k,this->flag));
    /* Own line provides queryseq3 */
    if (this->queryseq3 == NULL) {
      moveto(fp_sam,this->linestart);
      Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,this->linelen);
      if (this->flag & QUERY_MINUSP) {
	debug9(printf("complementing queryseq3\n"));
	make_complement_inplace(read,readlength);
      }
      this->queryseq3 = read;
      debug9(printf("queryseq3 is %s\n",read));
    }

    /* Mate's line provides queryseq5 */
    if (this->queryseq5 == NULL) {
      mate_allocated = queryseq5_index[this->readindex];
      mate = &(cells_allocated[mate_allocated]);
      debug9(printf("Mate is "));
      debug9(Cell_print_fromfile(fp_sam,mate,headers));
      moveto(fp_sam,mate->linestart);
      Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
      /* Strand is taken from the mate's flag, since the mate's line was read */
      if (mate->flag & QUERY_MINUSP) {
	debug9(printf("complementing queryseq5\n"));
	make_complement_inplace(read,readlength);
      }
      this->queryseq5 = read;
      debug9(printf("queryseq5 is %s\n",read));
    }
  }
	  
  Cell_standardize_queryseqs(this);

  return;
}
#endif



/* First pass over the SAM files for duplicate detection.

   Reads and parses every SAM line, filling one cell per line (in file
   order) and the per-line arrays split_outputs, flags and readindices.
   Consecutive lines with the same accession are treated as one read
   and share a readindex (0-based).

   For each read, hashseq5 and hashseq3 are taken from its primary
   lines (skipping NOT_PRIMARY lines and, when multiple_primaries_p is
   set, any line whose hiti is not "1").  They are stored into all
   cells of that read by Cell_fill_hashseqs once the next accession (or
   the end of input) is reached.

   Accession ownership: for ACC_SECONDARY_SORT the accession is stored
   in the cell; otherwise it is kept only until the next line has been
   compared against it and is then freed.

   Outputs: *n_mappers and *n_nomappers count lines without and with
   QUERY_UNMAPPED; *paired_end_p is set if any line has PAIRED_READ.

   Returns the number of reads found. */
static int
read_sam_files_for_hashseq (int *n_mappers, int *n_nomappers, bool *paired_end_p, T *cells,
			    SAM_split_output_type *split_outputs, unsigned int *flags, int *readindices,
			    FILE **sam_inputs, int *headerlengths, int *ncells, int ninputs,
			    Intlist_T linelengths, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets) {
  int nreads;

  FILE *fp_sam;
  int filei, linei;
  int j, k, lastk;

  size_t fileposition;
  int linelen;
  Univcoord_T genomicpos, mate_genomicpos;
  char *hiti;

  Intlist_T l;
  char *line;
  unsigned int flag;
  SAM_split_output_type split_output;
  int initial_softclip, readlength;
  char *acc, *last_acc;
  int lineindex, readindex;
  int acclength, last_acclength;
  Hashseq_T hashseq, hashseq5, hashseq3;
  bool need_mate_genomicpos_p;

  /* Both mate-based secondary sorts need the mate position.  (The
     second test previously repeated MATEFWD, so mate-rev never
     requested it.) */
  if (secondary_sort_method == MATEFWD_SECONDARY_SORT) {
    need_mate_genomicpos_p = true;
  } else if (secondary_sort_method == MATEREV_SECONDARY_SORT) {
    need_mate_genomicpos_p = true;
  } else {
    need_mate_genomicpos_p = false;
  }

  *n_mappers = *n_nomappers = 0;
  *paired_end_p = false;

  if (secondary_sort_method == ACC_SECONDARY_SORT) {
    /* Accessions are owned by the cells, so there is nothing to free here */
    last_acc = (char *) NULL;
  } else {
    /* Dummy string, so the first FREE(last_acc) in the loop has something to free */
    last_acc = MALLOC(sizeof(char));
    last_acc[0] = '\0';
  }
  last_acclength = 0;
  lineindex = -1;
  readindex = -1;		/* readindex is 0-based */

  fprintf(stderr,"Reading SAM files for possible duplicates...\n");

  hashseq5 = hashseq3 = 0;
  lastk = k = 0;
  l = linelengths;
  for (filei = 0; filei < ninputs; filei++) {
    fprintf(stderr,"  Reading file %d...",filei+1);
    Stopwatch_start(stopwatch);

    fp_sam = sam_inputs[filei];
    fileposition = headerlengths[filei];
    moveto(fp_sam,fileposition);

    for (linei = 0; linei < ncells[filei]; linei++) {
      linelen = Intlist_head(l);
      line = (char *) MALLOC((linelen+1)*sizeof(char));
      /* Will read linelen characters with '\n' and then '\0' */
      if (fgets(line,linelen+1,fp_sam) == NULL) {
	/* Do not parse an unfilled buffer */
	fprintf(stderr,"\nUnable to read SAM line %d of file %d\n",linei+1,filei+1);
	exit(9);
      }
      acc = Samread_parse_line(&acclength,&flag,&split_output,&hiti,
			       &genomicpos,&mate_genomicpos,&initial_softclip,&readlength,
			       &hashseq,line,linelen,chromosome_iit,chroffsets,need_mate_genomicpos_p);
      if (flag & PAIRED_READ) {
	*paired_end_p = true;
      }
      FREE(line);

      /* The NULL test keeps strcmp away from the initial NULL last_acc
	 (ACC sort) when the very first accession has length 0 */
      if (last_acc == NULL || acclength != last_acclength || strcmp(acc,last_acc)) {
	/* Process last acc and readindex */
	for (j = lastk; j < k; j++) {
	  Cell_fill_hashseqs(cells[j],hashseq5,hashseq3);
	}
	hashseq5 = hashseq3 = 0;

	lastk = k;
	readindex++;

      } else {
	/* Same accession */
      }
      if (secondary_sort_method != ACC_SECONDARY_SORT) {
	FREE(last_acc);
      }
      last_acc = acc;
      last_acclength = acclength;

      lineindex++;
      split_outputs[lineindex] = split_output;
      flags[lineindex] = flag;
      readindices[lineindex] = readindex;

      /* debug(printf("Read readindex %d, chrnum %d, chrpos %u, linelen %d\n",readindex,chrnum,chrpos,linelen)); */
      if (flag & NOT_PRIMARY) {
      /* Don't use secondary hit for computing hashseqs */

      } else if (multiple_primaries_p == true) {
	if (strcmp(hiti,"1")) {
	  /* Don't use second or later primary hit for computing hashseqs */
	} else if (flag & FIRST_READ_P) {
	  hashseq5 = hashseq;
	} else {
	  hashseq3 = hashseq;
	}
	
      } else if (flag & FIRST_READ_P) {
	hashseq5 = hashseq;
      } else {
	hashseq3 = hashseq;
      }
      
      FREE(hiti);
      if (secondary_sort_method == ACC_SECONDARY_SORT) {
	Cell_fill_for_dedup(cells[k++],lineindex,readindex,acc,
			    initial_softclip,readlength,genomicpos,mate_genomicpos);
      } else {
	Cell_fill_for_dedup(cells[k++],lineindex,readindex,/*acc*/(char *) NULL,
			    initial_softclip,readlength,genomicpos,mate_genomicpos);
      }

      if (flag & QUERY_UNMAPPED) {
	(*n_nomappers)++;
      } else {
	(*n_mappers)++;
      }

      fileposition += linelen;
      l = Intlist_next(l);
    }

    fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
  }

  /* Process last acc and readindex */
  for (j = lastk; j < k; j++) {
    Cell_fill_hashseqs(cells[j],hashseq5,hashseq3);
  }

  if (secondary_sort_method != ACC_SECONDARY_SORT) {
    FREE(last_acc);
  }
  nreads = readindex + 1;
  fprintf(stderr,"Done with reading.  Found %d reads total\n",nreads);

  return nreads;
}


#ifdef VERIFY_HASHSEQ
/* Second pass over the SAM files (VERIFY_HASHSEQ only).  For every
   read that was flagged in hashseq_duplicates_p, re-reads its primary
   SAM lines to obtain the actual query sequences and stores them in
   all cells of that read through Cell_fill_queryseqs.  Reads that were
   not flagged are skipped without touching the file.

   Lines are walked in the original order using linelengths, with
   lineindex counting over all files and linei counting within the
   current file; readindices[lineindex] tells which read a line
   belongs to.

   Changes relative to the earlier version:
   - The switch to the next input file now happens exactly when the
     current file's ncells lines are used up (linei used to be reset
     to 0 after it had already been tested, so every file after the
     first was switched one line late), and empty files are stepped
     over.
   - Every read of a line is preceded by moveto(), including the
     multiple-primaries case, which used to read from wherever the
     previous fgets stopped and so got out of step after a skipped
     NOT_PRIMARY line.
   - hiti is freed in all cases, and the query sequence of a second or
     later primary hit is freed instead of being dropped.
     NOTE(review): this assumes Samread_parse_read_and_hiti always
     returns separately allocated strings -- confirm in samread.c. */
static void
read_sam_files_for_queryseq (T *orig_cells, int ncells_total,
			     bool *hashseq_duplicates_p, int nreads,
			     unsigned int *flags, int *readindices,
			     FILE **sam_inputs, int *headerlengths, int *ncells,
			     Intlist_T linelengths) {
  bool *fillp, firstp;

  FILE *fp_sam;
  int filei, linei;

  size_t fileposition;
  int linelen;
  char *hiti;

  Intlist_T l;
  char *line;
  unsigned int flag;
  int lineindex, readindex, first_lineindex, k;
  char *queryseq, *queryseq5, *queryseq3;


  fprintf(stderr,"Reading SAM files again to verify duplicates...\n");
  fillp = (bool *) CALLOC(ncells_total,sizeof(bool));

  l = linelengths;
  lineindex = 0;

  filei = 0;
  fp_sam = sam_inputs[filei];
  fileposition = headerlengths[filei];
  moveto(fp_sam,fileposition);
  fprintf(stderr,"  Reading file %d...",filei+1);
  Stopwatch_start(stopwatch);
  linei = 0;			/* Number of lines already consumed from the current file */

  for (readindex = 0; readindex < nreads; readindex++) {
    debug(printf("readindex %d => hashseq duplicatep %d\n",readindex,hashseq_duplicates_p[readindex]));
    if (hashseq_duplicates_p[readindex] == false) {
      /* Skip this readindex */
      while (lineindex < ncells_total && readindices[lineindex] == readindex) {
	/* printf("lineindex %d, linei %d\n",lineindex,linei); */
	/* lineindex < ncells_total guarantees that a later file holds this line */
	while (linei >= ncells[filei]) {
	  fp_sam = sam_inputs[++filei];
	  fileposition = headerlengths[filei];
	  moveto(fp_sam,fileposition);
	  fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
	  fprintf(stderr,"  Reading file %d...",filei+1);
	  Stopwatch_start(stopwatch);
	  linei = 0;
	}
	linei++;

	linelen = Intlist_head(l);
	fileposition += linelen;
	l = Intlist_next(l);
	lineindex++;
      }

    } else {
      /* Process this readindex */
      queryseq5 = queryseq3 = (char *) NULL;
      first_lineindex = lineindex;

      while (lineindex < ncells_total && readindices[lineindex] == readindex) {
	/* printf("lineindex %d, linei %d\n",lineindex,linei); */
	/* lineindex < ncells_total guarantees that a later file holds this line */
	while (linei >= ncells[filei]) {
	  fp_sam = sam_inputs[++filei];
	  fileposition = headerlengths[filei];
	  moveto(fp_sam,fileposition);
	  fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
	  fprintf(stderr,"  Reading file %d...",filei+1);
	  Stopwatch_start(stopwatch);
	  linei = 0;
	}
	linei++;

	linelen = Intlist_head(l);

	if ((flag = flags[lineindex]) & NOT_PRIMARY) {
	  /* Don't use secondary hit for obtaining queryseqs */

	} else {
	  /* Always seek, because skipped lines do not advance the stream */
	  moveto(fp_sam,fileposition);
	  line = (char *) MALLOC((linelen+1)*sizeof(char));
	  /* Will read linelen characters with '\n' and then '\0' */
	  if (fgets(line,linelen+1,fp_sam) == NULL) {
	    fprintf(stderr,"\nUnable to read SAM line %d of file %d\n",linei,filei+1);
	    exit(9);
	  }
	  /* printf("Read line %s\n",line); */
	  queryseq = Samread_parse_read_and_hiti(&hiti,line,linelen,flag);

	  if (multiple_primaries_p == true && strcmp(hiti,"1")) {
	    /* Don't use second or later primary hit for computing hashseqs */
	    FREE(queryseq);
	  } else if (flag & FIRST_READ_P) {
	    fillp[lineindex] = true;
	    queryseq5 = queryseq;
	  } else {
	    fillp[lineindex] = true;
	    queryseq3 = queryseq;
	  }
	  FREE(hiti);
	  FREE(line);
	}

	fileposition += linelen;
	l = Intlist_next(l);
	lineindex++;
      }

      /* Note: The same queryseq5 and queryseq3 are shared across two SAM lines, 
	 so firstp marks one copy to delete queryseqs */
      firstp = true;
      for (k = first_lineindex; k < lineindex; k++) {
	Cell_fill_queryseqs(orig_cells[k],queryseq5,queryseq3,firstp);
	firstp = false;
      }
    }
  }

  fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));

  FREE(fillp);
  return;
}
#endif


/* Sorts the SAM lines of all inputs with duplicate detection and
   prints them.

   A read (one or more consecutive SAM lines with the same accession)
   is called a duplicate when, after sorting the cells with
   Cell_highpos_hashseq_cmp, one of its cells compares equal to the
   preceding cell of a different read.  For the cells after the first
   n_mappers, Cell_hashseq_cmp is used for the comparison instead.  The
   later read of such a pair is always marked; the earlier one is
   marked too if mark_first_p is set.  When VERIFY_HASHSEQ is defined,
   the candidates are re-read and confirmed by comparing the query
   sequences themselves.

   Outputs: *nreads is the number of reads returned by
   read_sam_files_for_hashseq; *nduplicates is the number of reads
   marked as duplicates.

   Returns the value of print_results, called here with a non-NULL
   duplicatep. */
static int
process_with_dedup (int *nduplicates, int *nreads, FILE **sam_inputs, int *headerlengths, int *ncells, int ninputs,
		    Intlist_T linelengths, int ncells_total, Univ_IIT_T chromosome_iit,
		    Univcoord_T *chroffsets, Filestring_T headers) {
  int n_duplicate_lines;
  int n_mappers, n_nomappers;
  T *cells;
  struct T *cells_allocated, *ptr;
  int i;
#ifdef VERIFY_HASHSEQ
  T *orig_cells;
#endif

  unsigned int *flags;
  SAM_split_output_type *split_outputs;
  int *readindices, readindex, readindex1, readindex2;
  bool *duplicatep, paired_end_p;


  /* One contiguous allocation of cells, plus an array of pointers that gets sorted */
  ptr = cells_allocated = (struct T *) MALLOC(ncells_total * sizeof(struct T));
  cells = (T *) MALLOC(ncells_total * sizeof(T));
  for (i = 0; i < ncells_total; i++) {
    cells[i] = &(ptr[i]);
  }
    
  /* Per-line arrays, indexed by lineindex (order of lines in the input files) */
  split_outputs = (SAM_split_output_type *) MALLOC(ncells_total * sizeof(SAM_split_output_type));
  flags = (unsigned int *) MALLOC(ncells_total * sizeof(unsigned int));
  readindices = (int *) MALLOC(ncells_total * sizeof(int));


  /* 1.  First read of SAM files to fill cells with hashseq */
  *nreads = read_sam_files_for_hashseq(&n_mappers,&n_nomappers,&paired_end_p,
				       cells,split_outputs,flags,readindices,
				       sam_inputs,headerlengths,ncells,ninputs,linelengths,
				       chromosome_iit,chroffsets);


#ifdef VERIFY_HASHSEQ
  /* 2.  Save original cell order before the sort */
  orig_cells = (T *) MALLOC(ncells_total*sizeof(T));
  memcpy(orig_cells,cells,ncells_total*sizeof(T));
#endif
  

  /* 3.  Sort entries, based on highpos and by hashseq.  This should also put nomappers at the end. */
  Stopwatch_start(stopwatch);
  fprintf(stderr,"Sorting SAM lines based on hash values...");
  qsort(cells,ncells_total,sizeof(T),Cell_highpos_hashseq_cmp);
  fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));


  /* 4.  Find duplicates based on hashseq */
  /* duplicatep is indexed by readindex, not by line; CALLOC starts it all false */
  duplicatep = (bool *) CALLOC(*nreads,sizeof(bool));

  Stopwatch_start(stopwatch);
  fprintf(stderr,"Finding duplicates based on hashseq...");

  /* 4a.  Mapped reads */
  /* Relies on the sort in step 3 having placed the n_mappers mapped lines first */
  for (i = 1; i < n_mappers; i++) {
    if ((readindex1 = cells[i-1]->readindex) == (readindex2 = cells[i]->readindex)) {
      /* Skip */
      /* Adjacent lines of the same read are not duplicates of each other */
    } else if (Cell_highpos_hashseq_cmp(&(cells[i]),&(cells[i-1])) == 0) {
      duplicatep[readindex2] = true;
#ifdef VERIFY_HASHSEQ
      /* Mark all since we will verify later */
      duplicatep[readindex1] = true;
#else
      if (mark_first_p == true) {
	duplicatep[readindex1] = true;
      }
#endif
    }
  }

  /* 4b.  Non-mapped reads */
  /* Starts at n_mappers + 1, so the last mapper and first nomapper are never compared */
  for (i = n_mappers + 1; i < ncells_total; i++) {
    if ((readindex1 = cells[i-1]->readindex) == (readindex2 = cells[i]->readindex)) {
      /* Skip */
    } else if (Cell_hashseq_cmp(&(cells[i]),&(cells[i-1])) == 0) {
      duplicatep[readindex2] = true;
#ifdef VERIFY_HASHSEQ
      /* Mark all since we will verify later */
      duplicatep[readindex1] = true;
#else
      if (mark_first_p == true) {
	duplicatep[readindex1] = true;
      }
#endif
    }
  }

  /* Count marked reads */
  *nduplicates = 0;
  for (readindex = 0; readindex < *nreads; readindex++) {
    if (duplicatep[readindex] == true) {
      (*nduplicates)++;
    }
  }

#ifdef VERIFY_HASHSEQ
  fprintf(stderr,"done (%.1f seconds).  Found %d reads as duplicates or first occurrences by hashseq\n",
	  Stopwatch_stop(stopwatch),*nduplicates);
#else
  fprintf(stderr,"done (%.1f seconds).  Found %d reads as duplicates by hashseq\n",
	  Stopwatch_stop(stopwatch),*nduplicates);
#endif


#ifdef VERIFY_HASHSEQ
  /* 5.  Second read of SAM files to fill possible duplicate cells with queryseq */
  read_sam_files_for_queryseq(orig_cells,ncells_total,duplicatep,*nreads,flags,readindices,
			      sam_inputs,headerlengths,ncells,linelengths);

  /* 6.  Mark all duplicates, based on queryseq */
  fprintf(stderr,"Verifying duplicates, based on reads...");
  Stopwatch_start(stopwatch);
  /* Discard the hashseq-based marks; they are recomputed from the sequences below */
  memset(duplicatep,false,(*nreads)*sizeof(bool));

  /* 6a.  Mapped reads */
  qsort(&(cells[0]),n_mappers,sizeof(T),Cell_highpos_queryseq_cmp);
  for (i = 1; i < n_mappers; i++) {
    if (cells[i]->fillp == false) {
      /* Skip */
    } else if (cells[i-1]->fillp == false) {
      /* Skip */
    } else if ((readindex1 = cells[i-1]->readindex) == (readindex2 = cells[i]->readindex)) {
      /* Skip */
    } else if (Cell_highpos_queryseq_cmp(&(cells[i]),&(cells[i-1])) == 0) {
      duplicatep[readindex2] = true;
      if (mark_first_p == true) {
	duplicatep[readindex1] = true;
      }
    }
  }

  /* 6b.  Non-mapped reads */
  qsort(&(cells[n_mappers]),n_nomappers,sizeof(T),Cell_queryseq_cmp);
  for (i = n_mappers + 1; i < ncells_total; i++) {
    if (cells[i]->fillp == false) {
      /* Skip */
    } else if (cells[i-1]->fillp == false) {
      /* Skip */
    } else if ((readindex1 = cells[i-1]->readindex) == (readindex2 = cells[i]->readindex)) {
      /* Skip */
    } else if (Cell_queryseq_cmp(&(cells[i]),&(cells[i-1])) == 0) {
      duplicatep[readindex2] = true;
      if (mark_first_p == true) {
	duplicatep[readindex1] = true;
      }
    }
  }

  /* Free the sequences only through cells that have firstp set */
  for (i = 0; i < ncells_total; i++) {
    if (orig_cells[i]->firstp == true) {
      FREE(orig_cells[i]->queryseq_alpha);
      FREE(orig_cells[i]->queryseq_omega);
    }
  }
  FREE(orig_cells);

  /* Recount, now based on the verified marks */
  *nduplicates = 0;
  for (readindex = 0; readindex < *nreads; readindex++) {
    if (duplicatep[readindex] == true) {
      (*nduplicates)++;
    }
  }

  fprintf(stderr,"done (%.1f seconds).  Found %d reads as duplicates by reads\n",
	  Stopwatch_stop(stopwatch),*nduplicates);
#endif


#if 0
  /* Not sure why this was done.  No further sorting is performed */
  for (i = n_mappers; i < ncells_total; i++) {
    if (duplicatep[cells[i]->readindex] == true) {
      cells[i]->hashseq_min = cells[i]->hashseq_max = 0; /* Will be sorted to end of list */
    }
  }
#endif


  /* 7.  Re-sort entries, based on genomicpos (rather than high), and secondary criterion */
  sort_cells(cells,ncells_total,restore_original_order_p,
	     secondary_sort_method,/*detect_dups_p*/true);

  /* Accessions are freed here, once the sort is finished */
  if (secondary_sort_method == ACC_SECONDARY_SORT) {
    for (i = 0; i < ncells_total; i++) {
      FREE(cells[i]->acc);
    }
  }

  /* 8.  Print */
  n_duplicate_lines = print_results(cells,ncells_total,sam_inputs,headerlengths,ncells,ninputs,
				    linelengths,split_outputs,flags,headers,
				    duplicatep,readindices,paired_end_p);

  FREE(duplicatep);

  FREE(readindices);
  FREE(flags);
  FREE(split_outputs);

  FREE(cells);
  FREE(cells_allocated);

  return n_duplicate_lines;
}



/* Checks that string has the form [+-]digits, optionally followed by
   an exponent e[+]digits, and nothing else.

   Returns string itself when it is valid.  On any malformed input,
   prints a message to stderr and exits with status 9; it never returns
   NULL.  (A missing exponent digit, as in "1e", used to return a null
   pointer, which the caller then handed to atoi.)

   NOTE(review): main() converts the result with atoi, which stops at
   the 'e', so "1e5" is accepted here but converted as 1. */
static char *
check_valid_int (char *string) {
  char *p = string;

  if (*p == '+' || *p == '-') {
    p++;
  }

  /* At least one digit is required.  The cast keeps isdigit defined
     for characters with negative values. */
  if (!isdigit((unsigned char) *p)) {
    fprintf(stderr,"value %s is not a valid int\n",string);
    exit(9);
  }
  while (isdigit((unsigned char) *p)) {
    p++;
  }

  if (*p == 'e') {
    p++;
    if (*p == '+') {
      p++;
    }
    /* The exponent also needs at least one digit */
    if (!isdigit((unsigned char) *p)) {
      fprintf(stderr,"value %s is not a valid int\n",string);
      exit(9);
    }
    while (isdigit((unsigned char) *p)) {
      p++;
    }
  }

  /* Anything left over makes the value invalid */
  if (*p != '\0') {
    fprintf(stderr,"value %s is not a valid int\n",string);
    exit(9);
  }

  return string;
}



#define BUFFERLEN 1024

/* Entry point.  Steps:
     1. Parse the command-line options.
     2. Read the chromosome IIT for the genome given by -d and compute
	the chromosome offsets.
     3. Open all SAM inputs and scan each one once to record its header
	length and the length of every SAM line; the total is checked
	against the file size.
     4. Unless suppressed, build the output headers from the first
	input.
     5. Sort and print, with or without duplicate detection, and report
	duplicate statistics to stderr.
     6. Close files and free memory.
   Exits with status 9 on usage and input errors. */
int
main (int argc, char *argv[]) {
  FILE **sam_inputs, *fp_sam;
  int ninputs, filei;
  int nchromosomes, i;
  bool *circularp;
  Univcoord_T *chroffsets;
  Chrpos_T *chrlengths;
  size_t fileposition;
  int lastchar;

  char buffer[BUFFERLEN], *lastp, *p;
  Intlist_T linelengths;
  int *headerlengths, linelen;
  Filestring_T headers = NULL;
  SAM_split_output_type split_output;
#ifdef DEBUG14
  Intlist_T linelengths_goldstd;
  int linelen_goldstd;
#endif

  char *fileroot = NULL, *iitfile;
  Univ_IIT_T chromosome_iit;	/* TODO: Generate a chromosome_iit from SAM headers */
  int *ncells, ncells_total, n_duplicate_lines;
  int nduplicates, nreads;

  int opt;
  extern int optind;
  extern char *optarg;
  int long_option_index = 0;
  const char *long_name;

  while ((opt = getopt_long(argc,argv,"D:d:^?",
			    long_options,&long_option_index)) != -1) {
    switch (opt) {
    case 0:
      long_name = long_options[long_option_index].name;
      if (!strcmp(long_name,"version")) {
	print_program_version();
	exit(0);
      } else if (!strcmp(long_name,"help")) {
	print_program_usage();
	exit(0);

      } else if (!strcmp(long_name,"split-output")) {
	split_output_root = optarg;
      } else if (!strcmp(long_name,"append-output")) {
	appendp = true;

      } else if (!strcmp(long_name,"sort2")) {
	if (!strcmp(optarg,"none")) {
	  secondary_sort_method = NO_SECONDARY_SORT;
	} else if (!strcmp(optarg,"orig")) {
	  secondary_sort_method = ORIG_SECONDARY_SORT;
	} else if (!strcmp(optarg,"accession")) {
	  secondary_sort_method = ACC_SECONDARY_SORT;
	} else if (!strcmp(optarg,"mate-fwd")) {
	  secondary_sort_method = MATEFWD_SECONDARY_SORT;
	} else if (!strcmp(optarg,"mate-rev")) {
	  secondary_sort_method = MATEREV_SECONDARY_SORT;
	} else {
	  fprintf(stderr,"--sort2 must be none, orig, accession, mate-fwd, or mate-rev\n");
	  exit(9);
	}

      } else if (!strcmp(long_name,"mark-dups")) {
	detect_duplicates_p = true;
	mark_duplicates_p = true;
	print_unique_p = true;
	print_duplicates_p = true;

      } else if (!strcmp(long_name,"mark-first")) {
	mark_first_p = true;

      } else if (!strcmp(long_name,"dups-only")) {
	detect_duplicates_p = true;
	print_unique_p = false;
	print_duplicates_p = true;

      } else if (!strcmp(long_name,"uniq-only")) {
	detect_duplicates_p = true;
	print_unique_p = true;
	print_duplicates_p = false;

      } else if (!strcmp(long_name,"restore-orig-order")) {
	restore_original_order_p = true;

      } else if (!strcmp(long_name,"multiple-primaries")) {
	multiple_primaries_p = true;

      } else if (!strcmp(long_name,"no-sam-headers")) {
	sam_headers_p = false;
	
      } else if (!strcmp(long_name,"output-buffer")) {
	output_nlines = atoi(check_valid_int(optarg));
	
      } else {
	/* Shouldn't reach here */
	/* Message now names this program (it said get-genome) and ends with a newline */
	fprintf(stderr,"Don't recognize option %s.  For usage, run 'sam_sort --help'\n",long_name);
	exit(9);
      }
      break;

    case 'D': user_genomedir = optarg; break;
    case 'd': 
      dbroot = (char *) CALLOC(strlen(optarg)+1,sizeof(char));
      strcpy(dbroot,optarg);
      break;

    case '^': print_program_version(); exit(0);
    case '?': print_program_usage(); exit(0);
    default: exit(9);
    }
  }
  /* The remaining arguments are the SAM input files */
  argc -= optind;
  argv += optind;

  if (dbroot == NULL) {
    print_program_usage();
    exit(9);
  } else if (!strcmp(dbroot,"?")) {
    Datadir_avail_gmap_databases(stdout,user_genomedir);
    exit(0);
  } else {
    genomesubdir = Datadir_find_genomesubdir(&fileroot,&dbversion,user_genomedir,dbroot);
    iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
			      strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
    sprintf(iitfile,"%s/%s.chromosome.iit",genomesubdir,fileroot);
    chromosome_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false);
    FREE(iitfile);

    FREE(dbversion);
    FREE(genomesubdir);
    FREE(fileroot);
    FREE(dbroot);

    nchromosomes = Univ_IIT_total_nintervals(chromosome_iit);
    /* Called only for its side effect of setting any_circular_p */
    circularp = Univ_IIT_circularp(&any_circular_p,chromosome_iit);
    FREE(circularp);

    /* chroffsets[i] is the sum of the lengths of all preceding chromosomes */
    chrlengths = Univ_IIT_chrlengths(chromosome_iit);
    chroffsets = MALLOC(nchromosomes * sizeof(Univcoord_T));
    chroffsets[0] = 0;
    for (i = 1; i < nchromosomes; i++) {
      chroffsets[i] = chroffsets[i-1] + chrlengths[i-1];
    }
    FREE(chrlengths);
  }
    
  /* Allocate slots for all outputs, even if --split-output is not used.
     The files themselves are opened later, on first use. */
  outputs = (FILE **) CALLOC((1+N_SPLIT_OUTPUTS),sizeof(FILE *));

  /* Inputs */
  ninputs = argc;
  sam_inputs = (FILE **) CALLOC(ninputs,sizeof(FILE *));
  headerlengths = (int *) CALLOC(ninputs,sizeof(int));
  ncells = (int *) CALLOC(ninputs,sizeof(int));
  for (filei = 0; filei < ninputs; filei++) {
    if ((sam_inputs[filei] = fopen(argv[filei],"r")) == NULL) {
      fprintf(stderr,"Cannot open SAM file %s\n",argv[filei]);
      exit(9);
    }
  }

  stopwatch = Stopwatch_new();
  Stopwatch_start(stopwatch);
  fprintf(stderr,"Analyzing %d SAM files...\n",ninputs);

  /* Scan each input once, recording the length of every SAM line */
  linelengths = (Intlist_T) NULL;
  ncells_total = 0;
  for (filei = 0; filei < ninputs; filei++) {
    fp_sam = sam_inputs[filei];
    fileposition = headerlengths[filei] = SAM_header_length(&lastchar,fp_sam); /* Ignore lastchar */

    /* Take care of char read by SAM_header_length */
#ifdef HAVE_FSEEKO
    fseeko(fp_sam,-1,SEEK_CUR);
#else
    fseek(fp_sam,-1,SEEK_CUR);
#endif

    /* A SAM line may span several buffers, so linelen accumulates
       until a newline is found */
    linelen = 0;
    ncells[filei] = 0;
    while (fgets(buffer,BUFFERLEN,fp_sam) != NULL) {
      /* printf("Read %s\n",buffer); */
      lastp = buffer;
      /* strchr is the standard equivalent of index() and is declared in <string.h> */
      while ((p = strchr(lastp,'\n')) != NULL) {
	linelen += (p - lastp)/sizeof(char) + 1;

	linelengths = Intlist_push(linelengths,linelen);
	fileposition += linelen;
	ncells[filei] += 1;

	linelen = 0;
	lastp = p + 1;
      }
      linelen += strlen(lastp);
      /* printf("Adding %d to get linelen %d\n",strlen(buffer),linelen); */
    }

    ncells_total += ncells[filei];

    /* Header length plus all line lengths must account for the whole file */
    if (fileposition != Access_filesize(argv[filei])) {
      fprintf(stderr,"Something is wrong with parsing of SAM file %s\n",argv[filei]);
      fprintf(stderr,"Initial file position using sortinfo: %llu\n",(unsigned long long) fileposition);
      /* Report the size of the file that failed (this used to print the size of the first input) */
      fprintf(stderr,"File size of SAM output file:       %llu\n",(unsigned long long) Access_filesize(argv[filei]));
      exit(9);
    } else {
      fprintf(stderr,"  File %d has %d SAM lines.\n",filei+1,ncells[filei]);
    }

  }

  fprintf(stderr,"Done with analysis (%.1f seconds).  Found %d SAM lines total.\n",
	  Stopwatch_stop(stopwatch),ncells_total);

  if (ncells_total == 0) {
    /* Exit without printing header */

  } else if (sam_headers_p == false) {
    /* Don't print SAM headers */

  } else {
    /* Headers are taken from the first input only */
    moveto(sam_inputs[0],0);
    headers = SAM_header_change_HD_tosorted(sam_inputs[0],headerlengths[0]);
  }

  /* Intlist_push built the list in reverse; put it back into file order */
  linelengths = Intlist_reverse(linelengths);

  if (detect_duplicates_p == false) {
    process_no_dedup(sam_inputs,headerlengths,ncells,ninputs,linelengths,ncells_total,
		     chromosome_iit,chroffsets,headers);
  } else {
    n_duplicate_lines = process_with_dedup(&nduplicates,&nreads,sam_inputs,headerlengths,ncells,ninputs,
					   linelengths,ncells_total,chromosome_iit,chroffsets,headers);
    if (print_duplicates_p == false) {
      fprintf(stderr,"Removed %d out of %d SAM lines as duplicates (%.1f%%)\n",
	      n_duplicate_lines,ncells_total,100.0*(double) n_duplicate_lines/(double) ncells_total);
      fprintf(stderr,"Removed %d out of %d SAM reads as duplicates (%.1f%%)\n",
	      nduplicates,nreads,100.0*(double) nduplicates/(double) nreads);
    } else if (mark_duplicates_p == true) {
      fprintf(stderr,"Marked %d out of %d SAM lines as duplicates (%.1f%%)\n",
	      n_duplicate_lines,ncells_total,100.0*(double) n_duplicate_lines/(double) ncells_total);
      fprintf(stderr,"Marked %d out of %d SAM reads as duplicates (%.1f%%)\n",
	      nduplicates,nreads,100.0*(double) nduplicates/(double) nreads);
    } else {
      fprintf(stderr,"Found %d out of %d SAM lines as duplicates (%.1f%%)\n",
	      n_duplicate_lines,ncells_total,100.0*(double) n_duplicate_lines/(double) ncells_total);
      /* Second line reports reads, as in the two cases above (it said "lines") */
      fprintf(stderr,"Found %d out of %d SAM reads as duplicates (%.1f%%)\n",
	      nduplicates,nreads,100.0*(double) nduplicates/(double) nreads);
    }
  }

  for (filei = 0; filei < ninputs; filei++) {
    fclose(sam_inputs[filei]);
  }
  FREE(sam_inputs);
  FREE(headerlengths);
  FREE(ncells);

  Stopwatch_free(&stopwatch);

  if (split_output_root != NULL) {
    touch_all_files(outputs,split_output_root,headers,appendp);

    for (split_output = 1; split_output <= N_SPLIT_OUTPUTS; split_output++) {
      if (outputs[split_output] != NULL) {
	fclose(outputs[split_output]);
      }
    }
  } else {
    /* Wrote to stdout */
  }
  FREE(outputs);

  if (headers != NULL) {
    Filestring_free(&headers,/*free_string_p*/true);
  }

  Intlist_free(&linelengths);

  FREE(chroffsets);
  Univ_IIT_free(&chromosome_iit);

  return 0;
}
