"""
Imports non-compressed Beagle .gprobs files, which contains the
probabilities for each of the 3 SNP genotypes.  The resulting
spreadsheet is composed of single precision floating point columns
where each SNP is stored as a single column, representing allelic
dosage (0*AA + 1*AB + 2*BB).  A and B alleles are determined by the
order of the three columns in the gprobs spreadsheet.

Author: Christophe Lambert  and Bryce Christensen, Golden Helix Inc.
Last Revised: 2013-12-23
"""

ghi.requireVersion('7.4')
import os

def basename(path):
    (dir, file) = os.path.split(path)
    return file

def scan_files(files,delim):
    
    all_samps = set()
    for filepath in files:
        f = open(filepath, "r")       
        line = f.readline().rstrip()
        if delim not in line:
            ghi.message("The specified delimiter was not found in the files. Please check the delimiter and try again.")
            return None

        v = line.split(delim)    
        sampleid = v[3::3]  
        all_samps.update(sampleid)
    
    all_samps = list(all_samps)
    rowDict = {sample:(i+1) for i,sample in enumerate(all_samps)}
    
    return rowDict,all_samps



def import_loop(filename, files,delim):
    
    file_tuple = scan_files(files,delim)
    if file_tuple == None:
        return
    else:
        rowDict,sampleid = file_tuple

    nR = len(sampleid)

    markers = []
    alleleA = []
    alleleB = []

    progress = ghi.progressDialog("Total Progress....",len(files))
    progress.setDoubleProgressMode(0, 100, "Importing " + basename(files[0])+"...")
    progress.show()

    myBuilder = ghi.dataSetBuilder(filename, len(sampleid))
    myBuilder.addRowLabels("Sample", sampleid)
    
    for i,filepath in enumerate(files):
        if progress.wasCanceled():
            return
        progress.setProgress(i)
        markers,alleleA,alleleB,builder = import_Beagle_gprobs(myBuilder, progress,i, rowDict, nR, markers, alleleA, alleleB, filepath, delim)
        if markers == None:
            return

            
       
    myBuilder2 = ghi.dataSetBuilder(filename + " Alleles",len(markers))
    myBuilder2.addRowLabels('Marker',markers)
    myBuilder2.addCategoricalColumn('alleleA',alleleA)
    myBuilder2.addCategoricalColumn('alleleB',alleleB)

    newSS = myBuilder.finish()
    newSS.appendLog("Imported BEAGLE .gprobs file from \n" + filepath)

    newSS2 = myBuilder2.finish()
    newSS2.appendLog("Imported allele info in BEAGLE .gprobs file from \n" + filepath)
    
    newSS.show()    
    


def import_Beagle_gprobs(myBuilder, progress, i, rowDict, nR, markers, alleleA, alleleB, filepath, delim):
   
    f = open(filepath, "r")
    fileStats = os.stat(filepath)
    fs = int(fileStats.st_size)

    progress.setDoubleProgressMode(0, 100, "Importing " + basename(filepath)+"...")
    progress.setProgress(i)
    progress.show()
    count = 0

    line = f.readline().rstrip()
    v = line.split(delim)

    A = v[1]
    B = v[2]

    sampleid = v[3::3]
        
    lastpos = 0L    
    
    while 1:
        line = f.readline().rstrip()

        if not line: 
            break
        
        lastpos = f.tell()
        if int(100*lastpos/fs) > count:
            count += 1
            progress.setSecondaryProgress(count)

        if progress.wasCanceled():
            progress.setSecondaryProgress(100) #make sure the progress bar completes
            f.close()
            progress.finish()
            return None,None,None,None
        
        v = line.split(delim)
        line = ""
        markerName = v[0]
        
        markers.append(markerName)
        alleleA.append(v[1])
        alleleB.append(v[2])
        
        v = v[3:]
        
        ab = v[1::3]
        bb = v[2::3]
        add = [None for i in range(nR)]            
        missing=False
        
        for i in range(len(ab)):
            if abs(float(ab[i]) - 0.333) < 1e-8 and abs(float(bb[i]) - 0.333) < 1e-8:
                missing = True
            else:
                row = rowDict[sampleid[i]]
                add[row-1] = float(ab[i]) + (2 * float(bb[i]))
        
        
        myBuilder.addRealColumn(markerName, add, 0)
        

    progress.setSecondaryProgress(100)     
    
    f.close()
    
    #progress.setSecondaryProgress(0)
    
    return markers,alleleA,alleleB,myBuilder


def prompt():
    pdl = [{"label":"Spreadsheet name:",'type':'string','default':'Beagle Files','name':'filename'},
            {"label":"Choose the BEAGLE gprobs file to import:", "type":"prompt"},
           {"label":"Select one or more gprobs.txt files...", "type":"files","filter":"BEAGLE gprobs text file (*.gprobs);;All Files (*)","name":"files"},
           {"label":"File format:","type":"combo","list":["Tab Delimited","Whitespace Delimited","Comma Delimited"],"name":"delim"}]
           
    prompt = ghi.promptDialog(pdl,title = "Import BEAGLE gprobs as Allelic Dosage",width = 400)
    if not prompt:
        return
        
    filename = prompt['filename']
    files = prompt["files"]
    delimStr = prompt["delim"]
    if delimStr == "Tab Delimited":
        delim = '\t'
    elif delimStr == "Whitespace Delimited":
        delim = ' '
    else:
        delim = ','
        
    import_loop(filename, files, delim)

try:
    prompt()        
except:
    ghi.error()
