Usuário:Abdo/MMMNS/Necroanalisador

De Stoa
Ir para: navegação, pesquisa

#! /bin/env python

# the fields in the data file are:
# ['ch_sim', 'fnomep', 'ano_obito', 'dtobito', 'dtnasc', 'sexo', 'codmunres', 'mun_res6']

# the municipalities in the data file are:
# set(['130260', '420820', '292740', '530010', '330455', '354850', '261160', '500270', '410690', '310620'])
# 410690 is Curitiba

import sys, math

def mean_err(seq):
    """Return (mean, standard error of the mean) for a sized sequence.

    The spread is the sample standard deviation (n - 1 denominator, clamped
    to 1 so a single observation yields zero), which is then divided by
    sqrt(n).  An empty sequence raises ZeroDivisionError.
    """
    count = len(seq)
    mean = sum(seq) / float(count)

    # accumulate the squared distances from the mean
    squared = 0
    for value in seq:
        squared += (value - mean) ** 2

    deviation = math.sqrt(squared / max(count - 1, 1))
    return (mean, deviation / math.sqrt(count))

# prints frequency along years for a single name
# TODO make main() call this, depends on splitting morgue() into functions
def autopsy(name, histogram_name_yearofbirth):
    """Print the per-year values stored for a single name.

    name -- key to look up in the histogram.
    histogram_name_yearofbirth -- mapping name -> {year: value}.

    One "year value" line is printed per year, in sorted year order,
    followed by a blank line.  A name missing from the histogram prints
    only the blank line.
    """
    # look up the requested name itself; the command line is not consulted
    if name in histogram_name_yearofbirth:
        by_year = histogram_name_yearofbirth[name]
        for year in sorted(by_year):
            # single-argument print behaves the same on Python 2 and 3
            print("%s %s" % (year, by_year[year]))
    print("")

# morgue() is the entry point of the analysis; the private helpers below
# each implement one stage of it.
def _field_matches(value, accepted):
    """Tell whether a field value satisfies one criterion.

    A collection criterion matches by membership, anything else by equality.
    """
    if isinstance(accepted, (list, tuple, set, frozenset)):
        return value in accepted
    return value == accepted


def _load_dead(municipality, requires, excluded):
    """Read "obitos_<municipality>" and return the records that pass.

    The file is tab-separated and its first line names the fields.  A record
    is kept when it matches every entry of requires and no entry of excluded.
    """
    dead = []
    # the with block closes the file even when a malformed row raises
    with open("obitos_" + municipality) as raw_dead:
        campos = raw_dead.readline().rstrip("\n").split("\t")
        for line in raw_dead:
            values = line.rstrip("\n").split("\t")
            # indexing (not zip) so a row shorter than the header still
            # raises IndexError instead of being silently truncated
            record = dict((campo, values[j]) for j, campo in enumerate(campos))
            if not all(_field_matches(record[k], requires[k]) for k in requires):
                continue
            if any(_field_matches(record[k], excluded[k]) for k in excluded):
                continue
            dead.append(record)
    return dead


def _name_frequencies(dead):
    """Return {name: {year: share of that year's records bearing the name}}.

    The year is the last four characters of the "dtnasc" field.  Every name
    gets an entry for every year present in the data (0.0 when absent).
    """
    counts = {}
    year_totals = {}
    for record in dead:
        year = record["dtnasc"][-4:]
        per_year = counts.setdefault(record["fnomep"], {})
        per_year[year] = per_year.get(year, 0) + 1
        year_totals[year] = year_totals.get(year, 0) + 1

    frequencies = {}
    for name, per_year in counts.items():
        frequencies[name] = dict(
            (year, float(per_year.get(year, 0)) / total)
            for year, total in year_totals.items())
    return frequencies


def morgue(municipality, num_names, num_extracts=1, requires=None, filter=None):
    """Print, per frequency extract, the names whose frequency varies least.

    municipality -- suffix of the data file name ("obitos_" + municipality).
    num_names -- total number of names to list; each extract lists
        num_names // num_extracts of them.
    num_extracts -- number of groups the names are split into, from the most
        frequent names to the least frequent; must be at least 1.
    requires -- {field: value or collection of values}; a record is kept only
        if every listed field matches.
    filter -- same shape as requires; a record matching any entry is dropped.

    For each name the mean yearly frequency and its relative standard error
    (mean_err's error divided by the mean) are computed.  Names are ranked by
    mean frequency and assigned to extracts so that each extract holds about
    an equal share of the total frequency.  Everything is written to stdout;
    nothing is returned.

    Raises ValueError when num_extracts is smaller than 1.
    """
    if num_extracts < 1:
        raise ValueError("num_extracts must be at least 1")
    requires = {} if requires is None else requires
    excluded = {} if filter is None else filter

    dead = _load_dead(municipality, requires, excluded)
    frequencies = _name_frequencies(dead)

    # calculate the relative standard error of frequency among years
    variability = {}
    frequency = {}
    for name, by_year in frequencies.items():
        m, e = mean_err(by_year.values())
        variability[name] = e / m
        frequency[name] = m

    # split the names into the number of extracts requested
    totalfrequency = sum(frequency.values())
    print("city %s, totfreq %f\n" % (municipality, totalfrequency))
    extracts = [[] for _ in range(num_extracts)]
    extracts_sum = [0 for _ in range(num_extracts)]
    extrato = 0
    # most frequent first; ties are broken by name so output is reproducible
    ranked = sorted(frequency.items(), key=lambda item: (-item[1], item[0]))
    for name, freq in ranked:
        # move on once the current extract holds its share of the total
        if extracts_sum[extrato] > totalfrequency / num_extracts \
           and extrato + 1 < num_extracts:
            extrato += 1
        extracts[extrato].append(name)
        extracts_sum[extrato] += freq

    # for each extract, print some info and the least variable names
    elensum = sum(len(group) for group in extracts)
    group_size = num_names // num_extracts
    for index, group in enumerate(extracts):
        # no surviving records means no names at all: report a zero share
        share = len(group) / float(elensum) if elensum else 0.0
        print("Extract %d:" % index)
        print("names %d, fraction of names %f, fraction of frequency %f"
              % (len(group), share, extracts_sum[index]))
        print("Group from extract %d:" % index)
        acumfreq = 0.0
        print("NAME\t\tFREQUENCY\tVARIABILITY")
        steadiest = sorted(group, key=lambda n: variability[n])[:group_size]
        for name in steadiest:
            # short names need a second tab to line up with the header
            print("%s%s%f\t%f" % (name, "\t" if len(name) > 7 else "\t\t",
                                  frequency[name], variability[name]))
            acumfreq += frequency[name]
        print("Total frequency in group %f\n" % acumfreq)
    print("")

def main():
    """Run morgue() for each configured municipality.

    Only records whose 'sexo' field is 'M' are analysed; 45 names are
    requested, spread over 3 extracts, and nothing is filtered out.
    """
    # a field may also list several accepted values: dict(sexo=['M','F'])
    required_fields = {'sexo': 'M'}
    excluded_fields = {}
    names_wanted, extract_count = 45, 3

    for code in set(['410690']):
        morgue(code, names_wanted, extract_count,
               required_fields, excluded_fields)

main()


Ferramentas pessoais

Variantes
Ações
Navegação
Imprimir/exportar
Ferramentas