
# Runs scripts/0_toml2json.py with the PANGAEA TOML metadata path and the
# JSON output path as its two arguments (presumably a TOML -> JSON
# conversion; confirm against the script).
rule toml2json:
    input:
        "MAD_v1/raw_metadata/pangaea_metadata.toml"
    output:
        "MAD_v1/raw_metadata/pangaea_metadata.json"
    shell:
        "python scripts/0_toml2json.py {input} {output}"

# Runs scripts/1_clip_raw_sound.py on the raw audio directory. Declared
# outputs are the clip directory and two CSV record files; all four paths
# are passed to the script positionally.
rule clip_raw_sound:
    input:
        sound="data/raw_audio"
    output:
        clip=directory("data/clip"),
        excel1="data/excel/clip_record.csv",
        excel2="data/excel/clip_record_final.csv"
    shell:
        "python scripts/1_clip_raw_sound.py "
        "{input.sound} {output.clip} {output.excel1} {output.excel2} "


# Runs scripts/2_add_metadata.py with the final clip record CSV and three
# metadata JSON files, producing one Excel workbook.
# meta2 is taken from the toml2json rule's output, so that rule is run first.
rule add_metadata:
    input:
        file="data/excel/clip_record_final.csv",
        meta1="MAD_v1/raw_metadata/onms_metadata.json",
        meta2=rules.toml2json.output,
        meta3="MAD_v1/raw_metadata/williams_metadata.json"
    output:
        excel="data/excel/metadata_final_data.xlsx"
    shell:
        "python scripts/2_add_metadata.py "
        "{input.file} {input.meta1} {input.meta2} {input.meta3} "
        "{output.excel}"


# Runs scripts/3_map_individuals.py with the MAD_v1 TTL file and the metadata
# workbook as inputs; the declared output is a JSON-LD file.
rule map_individuals:
    input:
        ttl="MAD_v1/MAD_v1_TTL.ttl",
        excel="data/excel/metadata_final_data.xlsx"
    output:
        jsonld="MAD_v1/map_individuals_metadata.jsonld"
    shell:
        "python scripts/3_map_individuals.py "
        "{input.ttl} {input.excel} {output.jsonld}"


# Runs scripts/4_query_individuals.py on the mapped JSON-LD file and writes
# the filtered JSON-LD file consumed by the downstream rules.
rule query_individuals:
    input:
        jsonld="MAD_v1/map_individuals_metadata.jsonld"
    output:
        jsonld="MAD_v1/filtered_metadata.jsonld"
    shell:
        "python scripts/4_query_individuals.py "
        "{input.jsonld} {output.jsonld}"


# Runs scripts/5_clip2features.py with the SurfPerch archive URL, the filtered
# JSON-LD file and the clip directory. Declared outputs are the feature CSV
# and a "model" directory (presumably where the downloaded archive is
# unpacked; confirm against the script).
rule clip2features:
    input:
        jsonld="MAD_v1/filtered_metadata.jsonld",
        clip="data/clip"
    output:
        librosa="data/excel/feature_embedding.csv",
        model=directory("model")
    params:
        url="https://zenodo.org/records/11071202/files/SurfPerch_v1.0.zip"
    shell:
        "python scripts/5_clip2features.py "
        "{params.url} {input.jsonld} {input.clip} "
        "{output.librosa} {output.model}"


# Runs scripts/6_umap.py with the filtered JSON-LD file, the clip directory,
# the feature CSV and the metadata workbook; the declared outputs are the
# UMAP and PCA figures.
rule umap:
    input:
        jsonld="MAD_v1/filtered_metadata.jsonld",
        clip="data/clip",
        feature="data/excel/feature_embedding.csv",
        excel="data/excel/metadata_final_data.xlsx"
    output:
        umap="result_figures/umap.png",
        pca="result_figures/pca.png"
    shell:
        "python scripts/6_umap.py "
        "{input.jsonld} {input.clip} {input.feature} {input.excel} "
        "{output.umap} {output.pca} "



# Runs scripts/7_gmm.py with the filtered JSON-LD file and the feature CSV;
# the declared outputs are a figure and a new JSON-LD file.
# Note the argument order: the JSON-LD output is passed before the figure.
rule gmm:
    input:
        jsonld="MAD_v1/filtered_metadata.jsonld",
        feature="data/excel/feature_embedding.csv"
    output:
        gmm="result_figures/gmm.png",
        jsonld="MAD_v1/gmm_filtered_metadata.jsonld"
    shell:
        "python scripts/7_gmm.py "
        "{input.jsonld} {input.feature} {output.jsonld} {output.gmm} "


# Aggregate target: requests the final figures and the GMM-annotated JSON-LD.
#
# Snakemake uses the FIRST rule in the Snakefile as the default target when
# none is given on the command line. This rule sits at the end of the file,
# so a bare `snakemake` run would otherwise build only `toml2json`.
# `default_target: True` (Snakemake >= 6.15) makes this rule the default
# regardless of its position.
rule all:
    default_target: True
    input:
        "result_figures/umap.png",
        "result_figures/pca.png",
        "result_figures/gmm.png",
        "MAD_v1/gmm_filtered_metadata.jsonld"




