#!/bin/bash


# Helper
# Global status flag: stays 1 until a recoverable problem is recorded,
# and is checked at the very end of the script to choose the exit status.
ALLOK=1

# X: mark the run as failed without stopping it.
X() { ALLOK=0; }
# fail TITLE [DETAIL]
# Print TITLE as a bold red "Argh:" line on stderr, followed by the optional
# DETAIL line, then terminate the script with exit status 1.
fail() {
	local headline=$1 detail=$2
	printf '\e[31;1mArgh: %s\e[0m\n' "${headline}" >&2
	if [[ -n ${detail} ]]; then
		echo "${detail}" >&2
	fi
	exit 1
}
# warn TITLE [DETAIL]
# Print TITLE as a bold yellow "Oops:" line on stderr, followed by the optional
# DETAIL line, and record the problem through X so the script keeps running
# but is flagged as not OK.
warn() {
	local headline=$1 detail=$2
	printf '\e[33;1mOops: %s\e[0m\n' "${headline}" >&2
	if [[ -n ${detail} ]]; then
		echo "${detail}" >&2
	fi
	X
}
# info TITLE [DETAIL]
# Print TITLE as a bold cyan "Info:" line on stderr, followed by the optional
# DETAIL line. Always returns 0: an absent DETAIL is not an error, so the
# status of the emptiness test must not leak out as the function's status.
info() {
	printf '\e[36;1mInfo: %s\e[0m\n'	"$1"	1>&2
	if [[ -n "$2" ]]; then
		printf '%s\n' "$2" 1>&2
	fi
	return 0
}

# usage [STATUS]
# Print the command line help on stderr and exit with STATUS (1 when omitted).
# Every option accepted by the getopts string is listed, including -q and -h.
usage() {
	{
		printf '%s\n' "Usage: $0 -f <DIR> -b <BATCH> [-l <LEN>] [-L {''|--link|--symbolic-link|--reflink}] -o <DIR> -m <MODE>"
		# One option per argument; the format adds the leading tab and newline.
		printf '\t%s\n' \
			'-f : directory containing .fastq.gz files' \
			'-b : batch name to use for 2nd level (e.g.: date)' \
			'-l : read length (default: autodetect)' \
			'-L : link parameter to pass to cp when copying (default: --link)' \
			'-t : tsv file (default: samples.<BATCH>.tsv)' \
			'-T : do not truncate (empty) the file before starting' \
			'-g : store list in .tsv.staging instead and only rename into final .tsv if successful' \
			'-D : sample have duplicates (e.g.: across lanes)' \
			'-p : prefix to prepend to fastq files (e.g.: for fusing runs)' \
			'-s : suffix to append to fastq files (e.g.: for fusing runs)' \
			'-o : output directory' \
			'-m : POSIX mode parameter to pass to mkdir (e.g.: 0770)' \
			'-q : quiet, do not print the per-file progress lines' \
			'-h : show this help and exit'
	} 1>&2
	exit "${1:-1}"
}
# TODO support FastQC files

# Option defaults; parse_options overrides them from the command line.
link=--link
read_len=
truncate=1
prefix=''
suffix=''
quiet=0
duplicates=0
staging=0
mode=

# parse_options ARGS...
# Parse the command line into the global option variables (fastq_dir,
# batch_name, read_len, link, prefix, suffix, out_dir, mode, tsv, duplicates,
# staging, truncate, quiet). Invalid values abort through fail; -h and unknown
# options end in usage.
parse_options() {
	local opt
	local -a matches
	# Interval written as {0,4}: the {,4} form is a GNU extension that POSIX
	# leaves undefined, so it may be rejected by non-GNU regex libraries.
	local rx_mode='^[0-7]{0,4}$'
	OPTIND=1
	while getopts "f:b:Dl:L:p:s:o:m:t:Tgqh" opt; do
		case "${opt}" in
			f)	fastq_dir="${OPTARG}"
				# shellcheck disable=SC2206
				matches=( ${fastq_dir} )	# HACK: no quotes -> expand glob.
				# only the first match is checked
				[[ -d "${matches[0]}" ]]	|| fail "Cannot access input directory ${fastq_dir}"
				;;
			b)	batch_name="${OPTARG}"
				[[ $batch_name =~ ^[[:alnum:]_]+$ ]]	|| fail "Invalid characters <${batch_name//[[:alnum:]_]/}> in <${batch_name}>" 'Please only use alphanumeric and <_> for batch name'
				;;
			D)	duplicates=1	;;
			l)	read_len="${OPTARG}"
				[[ $read_len =~ ^[[:digit:]]+$ ]]	|| fail  "Non-digit characters <${read_len//[[:digit:]]/}> in <${read_len}>" 'Please only use numbers for read length'
				;;
			L)	link="${OPTARG}"
				# an empty value is allowed and means a plain copy
				[[ -z "${OPTARG}" || "${OPTARG}" =~ ^(--link|--symbolic-link|--reflink)$ ]]	|| fail "Invalid link parameter <${OPTARG}>"
				;;
			p)	prefix="${OPTARG}"	;;
			s)	suffix="${OPTARG}"	;;
			o)	out_dir="${OPTARG}"	;;
			m)	mode="${OPTARG}"
				[[ $mode =~ $rx_mode ]]	|| fail "Invalid characters <${mode//[0-7]/}> in <${mode}>" 'mode should be an octal chmod value, see <mkdir --help> for informations'
				;;
			t)	tsv="${OPTARG}"	;;
			g)	staging=1	;;
			T)	truncate=0	;;
			q)	quiet=1	;;
			h)	usage 0	;;
			*)	usage 1	;;
		esac
	done
}
parse_options "$@"

: "${fastq_dir:?missing mandatory input directory for fastq files, use option -f}"	# -f is required
: "${batch_name:?missing mandatory batch name, use option -b}"	# -b is required
: "${out_dir:?missing mandatory output dir use option -o}"	# -o is required

# Without -t the sample list lives in the output directory, named after the batch.
tsv="${tsv:-${out_dir}/samples.${batch_name}.tsv}"

# In staging mode the list is written under a temporary name; the suffix is
# remembered so the file can be renamed to its final name at the end.
staging_suffix=
if (( staging )); then
	staging_suffix='.staging'
	tsv+="${staging_suffix}"
fi

# Regular expressions (POSIX extended) used to take fastq file names apart.

# lane marker, e.g. _L002
rxlane='_L[[:digit:]]+'
# sample number marker, e.g. _S200
rxsnum='_S[[:digit:]]+'
# recognised file extensions; group 1 is the extension without the leading dot
rxextentsion='\.((fastq|fq)(\.gz)?|fgz|fqz)$'
# read designator (group 1, e.g. R1), optional chunk number and optional _MM_<n>, then the extension
rxpairsuffix='_(R[[:digit:]])(_[[:digit:]]+)?(_MM_[[:digit:]])?'
rxpairsuffix+="${rxextentsion}"
# whole file name: group 1 is everything in front of the pair suffix
rxname="(.*)(${rxsnum})?(${rxlane})?"
rxname+="${rxpairsuffix}"
# e.g.: .../320194_1637_B5_S200_L002_R1_001.fastq.gz or BSSE_QGF_148702_000000000_CW477_1_H2O_CP_030_B06_CCAGTTAG_CCTGTCAT_S42_L001_R2_001_MM_1.fastq.gz

# OS dependent part: choose how find is told to use extended regular
# expressions. Only one of the two variables gets set; the other one stays
# unset and therefore expands to nothing on the find command line.
case "${OSTYPE}" in
	darwin*)
		# BSD-style find: flag in front of the paths
		bsdextregex='-E'
		;;
	*)
		# GNU-style find: expression option
		gnuextregex='-regextype posix-extended'
		;;
esac
# Bookkeeping of which read labels (R1, R2, ...) were already met per sample.
# Three functions are defined, backed by whichever storage this bash supports:
#   addread SAMPLE READ      - record READ for SAMPLE
#   testnumreads SAMPLE READ - return 1 if READ is already recorded for SAMPLE,
#                              otherwise record it and return 0
#   seen SAMPLE              - return 0 if anything was recorded for SAMPLE
# The probe's stderr is dropped so an old bash does not print a usage error.
if declare -A reads 2>/dev/null; then
	# newer bash 4.x and above on most GNU/Linux: one associative array,
	# each value is a ":"-prefixed list of read labels, e.g. ":R1:R2"
	addread() {
		reads[${1}]="${reads[${1}]}:$2"
	}
	testnumreads() {
		# exact label comparison; the appended ":" closes the last entry
		if [[ "${reads[${1}]}:" == *":$2:"* ]]; then
			return 1
		fi
		reads[${1}]="${reads[${1}]}:$2"
		return 0
	}
	seen() {
		[[ -n ${reads[${1}]} ]]
	}
else
	# older bash 3.x (e.g. as shipped with Mac OS X): two indexed arrays,
	# reads1 holds sample names, reads2 holds "sample:read" entries
	declare -a reads1
	declare -a reads2
	addread() {
		reads1+=( "$1" )
		reads2+=( "$1:$2" )
	}
	testnumreads() {
		local e
		for e in "${reads2[@]}"; do
			if [[ "$e" == "$1:$2" ]]; then
				return 1
			fi
		done
		reads1+=( "$1" )
		reads2+=( "$1:$2" )
		return 0
	}
	seen() {
		local e
		for e in "${reads1[@]}"; do
			if [[ "$e" == "$1" ]]; then
				return 0
			fi
		done
		return 1
	}
fi

# files
# Create the output directory (with the requested mode, if any) and, unless
# -T was given, start from an empty sample list. Both steps are fatal when
# they fail: without a writable list the rest of the run would be useless.
# shellcheck disable=SC2086
mkdir -p ${mode:+-m ${mode}} "${out_dir}/" || fail "cannot create output directory <${out_dir}>"
if (( truncate )); then
	printf '' > "${tsv}" || fail "cannot write sample list <${tsv}>"
fi

# scan for fastq files
paired=0	# already-seen samples for which a new read label was found
numsam=0	# distinct samples written to the sample list
numdup=0	# repeated read labels tolerated because of -D
# IFS= stops read from trimming leading/trailing whitespace off the path;
# -d '' consumes the NUL-separated records produced by find -print0.
while IFS= read -r -d '' f; do
	# sanity check: -s fails for missing as well as for empty files
	if [[ ! -s "${f}" ]]; then
		warn "Cannot access <${f}>"
		continue
	fi
	if [[ "${f##*/}" =~ ^\._ ]]; then
		# skip Mac OS X resource forks
		continue
	fi

	# extract name
	if [[ ! "${f##*/}" =~ ^${rxname} ]]; then
		warn "Cannot parse <${f##*/}>"
		continue
	fi

	# rawname keeps the sample number and lane markers (used for the copied
	# file's name); samname has both stripped from its end.
	samname="${BASH_REMATCH[1]}"
	rawname="${samname}"
	[[ "${samname}" =~ ^(.*)${rxlane}$ ]] && samname="${BASH_REMATCH[1]}"
	[[ "${samname}" =~ ^(.*)${rxsnum}$ ]] && samname="${BASH_REMATCH[1]}"
	[[ "${f##*/}" =~ ${rxpairsuffix} ]] && read="${BASH_REMATCH[1]}"
	[[ "${f##*/}" =~ ${rxextentsion} ]] && ext="${BASH_REMATCH[1]}"
	# printf instead of echo -e: backslashes in names are printed verbatim
	(( quiet )) || printf '%s\t%s\t' "$samname" "$read"

	# check seen
	if seen "${samname}"; then
		# check number of reads
		if testnumreads "${samname}" "$read"; then
			(( quiet )) || echo "pair"
			(( paired ++ ))
		elif (( duplicates )); then
			(( quiet )) || echo "(duplicate $read)"
			(( numdup ++ ))
		else
			warn "sample ${samname} already has a read $read"
		fi
	else
		addread "${samname}" "$read"
		if [[ -z ${read_len} ]]; then
			# autodetect length: awk looks at the sequence line of each
			# 4-line record, tracks the longest one and stops after 100
			# reads that did not raise the maximum.
			# if/else instead of "A && B || C": a failing gunzip must not
			# fall through to cat and feed compressed bytes to awk.
			rlen=$(
				if [[ $ext =~ z$ ]]; then
					gunzip -c "${f}"
				else
					cat "${f}"
				fi |
				awk 'NR%4==2{l=length($0);if(m<l){m=l}else{++n}};n==100{exit(0)};END{print m}'
			)
		else
			rlen=${read_len}
		fi
		(( quiet )) || echo "${rlen}"
		printf '%s\t%s\t%s\n' "${samname}" "${batch_name}" "${rlen}" >> "${tsv}" \
			|| warn "Cannot write to <${tsv}>"
		(( numsam++ ))
	fi

	# Create <out>/<sample>/<batch>/{raw_data,extracted_data} and copy/link the
	# file in. Failures are recorded through warn so the final status shows them.
	# shellcheck disable=SC2086
	if ! mkdir -p ${mode:+-m ${mode}} "${out_dir}/"{,"${samname}/"{,"${batch_name}/"{,raw_data,extracted_data}}}; then
		warn "Cannot create directories for sample <${samname}>"
		continue
	fi
	# shellcheck disable=SC2086
	cp ${link} "${f}" "${out_dir}/${samname}/${batch_name}/raw_data/${prefix}${rawname}${suffix}_${read}.${ext}" \
		|| warn "Cannot copy <${f}>"
# scanning with find
done < <(
	# fastq_dir is left unquoted on purpose so that a glob given to -f expands
	# shellcheck disable=SC2086
	find ${bsdextregex} ${fastq_dir} -type f ${gnuextregex} -iregex ".*${rxextentsion}" -print0 )

# final words
# Pairing summary: every sample paired -> info, only some -> warning,
# none -> single end run.
if (( paired )); then
	if (( paired == numsam )); then
		info "${paired} pair end samples"
	else
		warn "${numsam} samples, of which ${paired} paired"
	fi
else
	info "${numsam} single end samples"
fi
# Duplicate summary: the duplicate count is expected to be a whole multiple
# of the number of files counted as samples or pairs.
if (( numdup )); then
	if (( 0 == (numdup % ( paired + numsam )) )); then
		info "${numdup} duplicates ( $((numdup /  ( paired + numsam ) + 1))x )"
	else
		warn "$(( paired + numsam )) samples, but only ${numdup} duplicates" "(missing: $((numdup % ( paired + numsam ) )) )"
	fi
fi
# Any warning recorded during the run makes the whole run fail.
# The message is quoted so it is printed as one title, not title + detail.
if (( ! ALLOK )); then
	warn 'Some errors'
	exit 1
fi

# Staging mode: only now give the sample list its final name.
if (( staging )); then
	mv -v "${tsv}" "${tsv%"${staging_suffix}"}" || fail "cannot rename <${tsv}> to <${tsv%"${staging_suffix}"}>"
fi
info 'All Ok'
exit 0
