#!/usr/bin/env python3

import argparse
import csv
import io
import json
import os
import re
import sys

# Command-line interface.
# Keyword arguments that only restate argparse's defaults (required=False,
# type=str, default=None) are left out; the accepted options, their order in
# --help and the resulting attributes of `args` are the same.
argparser = argparse.ArgumentParser(
	description="Uses bcl2fastq's demultiplexing stats as metadata to organise samples",
)

# -- input and output locations
argparser.add_argument(
	'-S', '--statsdir',
	dest='statsdir', metavar='DIR', required=True,
	help="directory containing 'Stats/Stats.json'",
)
argparser.add_argument(
	'-f', '--fastqdir',
	dest='fastqdir', metavar='DIR',
	help="directory containing .fastq.gz files if different from above",
)
argparser.add_argument(
	'-q', '--qcdir',
	dest='qcdir', metavar='DIR',
	help="if set, import FastQC's _fastqc.html files from there",
)
argparser.add_argument(
	'-o', '--outdir',
	dest='outdir', metavar='DIR', default='sampleset',
	help="output directory",
)

# -- how directories and files get created by the generated script
argparser.add_argument(
	'-m', '--mode',
	dest='mode', metavar='MODE',
	help="POSIX file access mode to be passed to mkdir",
)
argparser.add_argument(
	'-L', '--linking',
	dest='link', metavar='CPLINK', default='--link',
	choices=['', '--link', '--symbolic-link', '--reflink'],
	help="parameter to pass to `cp` for linking files instead of copying their data",
)

# -- boolean switches
argparser.add_argument(
	'--force',
	dest='force', action='store_true',
	help="Force overwriting any existing file when moving",
)
argparser.add_argument(
	'-s', '--summary',
	dest='summary', action='store_true',
	help="Only display a summary of datasets, not an exhaustive list of all samples",
)
argparser.add_argument(
	'-a', '--append',
	dest='append', action='store_true',
	help="Append to the end of movedatafiles.sh, instead of overwritting\n(use when calling from an external combiner wrapper)",
)
argparser.add_argument(
	'-g', '--staging',
	dest='staging', action='store_true',
	help="Write samples list in .tsv.staging and only rename them to the final .tsv at the end of movedatafiles.sh if there were no errors.",
)
argparser.add_argument(
	'-n', '--noempty',
	dest='noempty', action='store_true',
	help="skip fastq.gz files with bad yield (0 reads)",
)

# -- sample renaming
argparser.add_argument(
	'-p', '--patchmap',
	dest='patchtsv', metavar='TSV',
	help="patchmap file to rename samples",
)

args = argparser.parse_args()

# Shorthands for the parsed options used throughout the script.
statsdir=args.statsdir
# fastq files are looked up next to the stats unless --fastqdir was given
fastqdir=args.fastqdir if args.fastqdir else statsdir
qcdir=args.qcdir
sampleset=args.outdir
link=args.link
append=args.append
noempty=args.noempty
# non-empty only with --staging: suffix added to the samples .tsv file name
staging_suffix='.staging' if args.staging else ''

statsjson=os.path.join(statsdir, 'Stats/Stats.json')

# parse the chmod parameter: an octal string (e.g. "0770"), None when --mode is absent
try:
	mkdirmode=int(args.mode, base=8) if args.mode else None
except ValueError:
	# int() raises ValueError on text that is not valid octal; catching only
	# that keeps KeyboardInterrupt/SystemExit and programming errors visible.
	print(f"cannot parse <{args.mode}> as an octal chmod value. see `mkdir --help` for informations")
	sys.exit(2)

# regex for parsing and/or validating strings
# NOTE: raw strings, so that \d and \w reach the regex engine untouched instead
# of being (invalid) Python string escapes.

# Run folder name: optional century, yymmdd date, instrument, run number, then
# the flowcell with an optional prefix (zero padding + dash, or a side letter
# A/B). The conditional (?(prefix)|-\d+) requires a trailing "-digits" part on
# the cell only when no prefix was matched.
# e.g.: 200430_M01761_0414_000000000-J3JCT or 201023_A00730_0259_BHTVCCDRXX or 20210709_FS10001154_41_BPA73113-1417
rxrun=re.compile(r'^(?P<century>20)?(?P<date>\d{6})_(?P<instr>\w+)_(?P<num>\d+)_(?P<prefix>(?:0+-)|[AB])?(?P<cell>\w+(?(prefix)|-\d+))$')

# Flowcell identifier: drops an optional zero-padding prefix, keeps an optional "-digits" tail.
# e.g.: '000000000-CTT3D' or 'HTVCCDRXX' or 'BPA73113-1417'
rxcell=re.compile(r'(?:0+-)?(?P<cell>\w+(?:-\d+)?)$')

# Optional sample renaming: the patchmap TSV maps an old sample name (first
# column) to its replacement (second column); further columns are ignored.
# Stays empty when no --patchmap was given.
patchmap = { }
if args.patchtsv:
	with open(args.patchtsv,'rt',encoding='utf-8', newline='') as pf:
		# Rows with fewer than two fields (e.g. blank lines) cannot provide a
		# mapping; they are skipped instead of aborting on tuple unpacking.
		patchmap = { row[0]:row[1] for row in csv.reader(pf, delimiter="\t") if len(row) >= 2 }


# load the demultiplexing statistics (JSON text, decoded as UTF-8)
with open(statsjson, 'rt', encoding='utf-8') as f:
	stats = json.load(f)

# parse flowcell
# A missing or non-string field is reported the same way as a non-matching
# one, instead of relying on a catch-all `except` that also hid other errors.
flowcell_field=stats.get('Flowcell')
m=rxcell.search(flowcell_field) if isinstance(flowcell_field, str) else None
if m is None:
	sys.exit(f"cannot parse: {flowcell_field}")
flowcell=m['cell']

# parse run folder
runfolder=stats.get('RunId')
m=rxrun.search(runfolder) if isinstance(runfolder, str) else None
if m is None:
	sys.exit(f"cannot parse: {runfolder}")
rundate=f"20{m['date']}" # NOTE runfolders are yymmdd, not yyyymmdd
# the flowcell embedded in the run folder name is only cross-checked, never used
if flowcell != m['cell']:
	print(f"Warning: cell missmatch: {flowcell} vs {m['cell']}")

print(runfolder, flowcell, rundate, sep='\t')

# Collect, for every lane, the number of non-index reads ('ends') and the
# longest read's cycle count minus one ('rlen'), keyed by lane number.
lane={}
for lane_info in stats['ReadInfosForLanes']:
	lanenum=lane_info['LaneNumber']
	read_count=0
	longest=0

	for read in lane_info['ReadInfos']: # read phases (indexes, reads)
		# index reads carry barcodes, not sequence data
		if read['IsIndexedRead']:
			continue

		cycles=read['NumCycles']

		# sanity check: every read of a lane is expected to have the same length
		if longest and longest != cycles:
			print(f"Warning: read lenght changes from {longest} to {cycles} we currently only support symetric read lenghts")

		# gather info
		read_count+=1
		longest=max(longest, cycles)

	# sanity check: only reported, processing carries on regardless
	if not 1 <= read_count <= 2:
		print(f"Error: we currently only support single or paired ends, but found {read_count} reads")

	lane[lanenum]={'ends': read_count, 'rlen': longest-1}

# make sure the sampleset directory exists
if not os.path.isdir(sampleset):
	# only pass a mode to mkdir when one was requested on the command line
	mkdir_options={ }
	if mkdirmode:
		mkdir_options['mode']=mkdirmode
	try:
		os.mkdir(sampleset, **mkdir_options)
	except FileExistsError:
		# something already exists at that path: carry on
		pass

# output files
# the batch name identifies this run in file names and directory names
batch='_'.join((rundate, flowcell))
# samples list (written with the staging suffix when --staging is set)
tsv_path=os.path.join(sampleset, f'samples.{batch}.tsv{staging_suffix}')
tsv=open(tsv_path, 'wt')
# shell script file with all moving instructions inside
sh_mode='at' if append else 'wt'
sh=open(os.path.join(sampleset, 'movedatafiles.sh'), sh_mode)

# generic header: only for stand-alone files.
# It defines the shell variables `link` and `mode`, the helpers fail() and
# warn(), the ALLOK flag with its X() failure marker, and checks that the
# sampleset directory exists. '%%s' is an escaped percent sign: it reaches the
# shell's printf as '%s'.
if not append:
	header_values={
		'link': link,
		'mode': f"--mode={mkdirmode:04o}" if mkdirmode else '',
		'sampleset': sampleset,
	}
	header=r'''
link='%(link)s'
mode='%(mode)s' # e.g.: --mode=0770

# Helper
fail() {
	printf '\e[31;1mArgh: %%s\e[0m\n'	"$1"	1>&2
	[[ -n "$2" ]] && echo "$2" 1>&2
	exit 1
}

warn() {
	printf '\e[33;1mArgh: %%s\e[0m\n'	"$1"	1>&2
	[[ -n "$2" ]] && echo "$2" 1>&2
}

ALLOK=1
X() {
	ALLOK=0
}

# sanity checks
[[ -d '%(sampleset)s' ]] || fail 'No sampleset directory:' '%(sampleset)s'
''' % header_values
	print(header, file=sh)

# per batch directory checks: the generated script aborts early when an input
# directory is missing
print(r"[[ -d '%(download)s' ]] || fail 'No download directory:' '%(download)s'" % {'download':fastqdir}, file=sh)
if qcdir:
	# report the FastQC directory under its own name, so that the message does
	# not point at the download directory
	print(r"[[ -d '%(qc)s' ]] || fail 'No FastQC directory:' '%(qc)s'" % {'qc': qcdir}, file=sh)


# parse info about samples
# For every sample of every lane: print a summary line, add a row to the
# samples .tsv and append a bash snippet to the shell script.

# Snippet, part 1: list the sample's fastq files, check that their number
# matches the number of ends, create the directory tree and start a loop that
# copies (or links) each file into raw_data. The loop is closed by 'done' below.
fastq_template=r'''
fastq=( %(download)s/%(sname)s*_R[1-2]*%(suffix)s.fastq.gz )
[[ "${fastq[*]}" =~ [\*\[] ]] && fail 'Cannot list fastq files:' '%(sname)s'
(( ${#fastq[@]} != %(ends)u )) && fail 'Number of fastq files not %(ends)u' "${#fastq[@]} : ${fastq[*]}"
mkdir ${mode} -p "%(sampleset)s/"{,"%(fname)s/"{,"%(batch)s/"{,raw_data,extracted_data}}}
for file in "${fastq[@]}"; do
	filename="${file##*/}"
	[[ $file =~ _(R[[:digit:]])((_.*)?[.]fastq[.]gz)$ ]] && destname="${filename//${BASH_REMATCH[2]}/.fastq.gz}"
	cp -v%(force)s ${link} "${file}" "%(sampleset)s/%(fname)s/%(batch)s/raw_data/${destname}"||X
'''

# Snippet, part 2 (only with --qcdir): inside the same loop, copy the matching
# FastQC report into extracted_data.
fastqc_template=r'''
	fqcname="${filename//%(suffix)s.fastq.gz/_fastqc.html}"
	cp -v%(force)s ${link} "%(fastqc)s/${fqcname}" "%(sampleset)s/%(fname)s/%(batch)s/extracted_data/${BASH_REMATCH[1]}_fastqc.html"||X
'''

force_flag='f' if args.force else ''

for lane_stats in stats['ConversionResults']: # lane
	lane_info=lane[lane_stats['LaneNumber']]
	ends=lane_info['ends']
	rlen=lane_info['rlen']
	layout=f"{'paired' if ends > 1 else 'single'}-end"

	for sample in lane_stats['DemuxResults']: # sample in lane
		samname=sample['SampleName']
		# renamed through the patchmap when listed there, kept as-is otherwise
		fixedname=patchmap.get(samname, samname)

		# filter out fastq files with zero reads
		if noempty and sample['NumberReads'] == 0:
			print(samname, "\x1b[33;1mBad yield !\x1b[0m", sep='\t')
			continue

		# info
		if not args.summary:
			print(fixedname, layout, rlen, sep='\t')

		# output files
		print(fixedname, batch, rlen, sep='\t', file=tsv)

		# files are searched under the original name ('sname') and stored
		# under the possibly patched one ('fname')
		# 'midfix': _S*_L%(lane)03u
		# 'suffix': _001_MM_1 mm
		subst={
			'force': force_flag,
			'download': fastqdir,
			'sampleset': sampleset,
			'sname': samname,
			'fname': fixedname,
			'ends': ends,
			'batch': batch,
			'suffix': '',
		}
		print(fastq_template % subst, file=sh)
		if qcdir:
			subst['fastqc']=qcdir
			print(fastqc_template % subst, file=sh)
		print('done', file=sh)


# coda: rename staging and return status
if args.staging:
	# Only promote the staged samples list when no copy failed (ALLOK still 1).
	# Paths are double-quoted, like in the per-sample mkdir/cp commands, so
	# that a sampleset directory containing spaces survives word splitting.
	print(f'(( ALLOK )) && mv -v "{sampleset}/samples.{batch}.tsv{staging_suffix}" "{sampleset}/samples.{batch}.tsv"', file=sh)

# final status: only for stand-alone files (an appending caller is expected to
# write its own ending)
if not append: print("""
if (( ! ALLOK )); then
	echo Some errors
	exit 1
fi;

echo All Ok
exit 0
""", file=sh)

# Close both output files explicitly so that their content is flushed to disk
# here rather than at some unspecified point during interpreter shutdown.
tsv.close()
sh.close()
