Usuário:Abdo/MMMNS/Necroanalisador
De Stoa
< Usuário:Abdo | MMMNS
#!/usr/bin/env python
"""Rank first names by how stable their frequency is across years of birth.

Records are read from tab-separated files named ``obitos_<municipality>``
(one header line followed by one record per line).  For every name the
script computes its mean relative frequency per year of birth and the
relative standard error of that frequency, splits the names into extracts of
roughly equal total frequency, and prints the least variable names of each
extract.

The code runs unchanged on Python 2 and Python 3.
"""

# the fields in the data file are:
# ['ch_sim', 'fnomep', 'ano_obito', 'dtobito', 'dtnasc', 'sexo', 'codmunres', 'mun_res6']
# the municipalities in the data file are:
# set(['130260', '420820', '292740', '530010', '330455', '354850', '261160', '500270', '410690', '310620'])
# 410690 is Curitiba

import math
import sys

# Criterion values of these types are treated as "any of these values".
_COLLECTION_TYPES = (list, tuple, set, frozenset)


def mean_err(seq):
    """Return ``(mean, standard error of the mean)`` for the numbers in *seq*.

    The deviation uses the sample (n - 1) denominator; with a single value
    the denominator is clamped to 1, so the reported error is 0.

    Raises:
        ZeroDivisionError: if *seq* is empty.
    """
    values = list(seq)  # accept any iterable, including dict views
    count = len(values)
    avg = sum(values) / float(count)
    dev = math.sqrt(sum((x - avg) ** 2 for x in values) / max(count - 1, 1))
    return (avg, dev / math.sqrt(count))


# TODO make main() call this
def autopsy(name, histogram_name_yearofbirth):
    """Print the frequency of *name* along the years, one ``year value`` per line.

    *histogram_name_yearofbirth* maps name -> {year: value}.  Years are
    printed in sorted order.  Nothing but the trailing blank line is printed
    when *name* is not present in the histogram.
    """
    if name in histogram_name_yearofbirth:
        per_year = histogram_name_yearofbirth[name]
        for year in sorted(per_year):
            print("%s %s" % (year, per_year[year]))
    print("")


def _field_matches(value, expected):
    """Return True if *value* equals *expected* or is a member of it."""
    if isinstance(expected, _COLLECTION_TYPES):
        return value in expected
    return value == expected


def _record_selected(record, requires, excludes):
    """Return True if *record* meets every requirement and no exclusion."""
    if not all(_field_matches(record[key], expected)
               for key, expected in requires.items()):
        return False
    return not any(_field_matches(record[key], expected)
                   for key, expected in excludes.items())


def _load_records(municipality, requires, excludes):
    """Read ``obitos_<municipality>`` and return the selected records as dicts."""
    selected = []
    with open("obitos_" + municipality) as raw_dead:
        fields = raw_dead.readline().rstrip("\r\n").split("\t")
        for line in raw_dead:
            line = line.rstrip("\r\n")
            if not line:
                # tolerate blank lines (typically a trailing one)
                continue
            values = line.split("\t")
            if len(values) < len(fields):
                raise IndexError("record has %d fields, header has %d"
                                 % (len(values), len(fields)))
            # fields beyond the header, if any, are ignored
            record = dict(zip(fields, values))
            if _record_selected(record, requires, excludes):
                selected.append(record)
    return selected


def _count_births(dead):
    """Return ``(count per name and year of birth, count per year of birth)``."""
    by_name = {}
    by_year = {}
    for record in dead:
        year = record["dtnasc"][-4:]  # the year is the last 4 characters
        per_year = by_name.setdefault(record["fnomep"], {})
        per_year[year] = per_year.get(year, 0) + 1
        by_year[year] = by_year.get(year, 0) + 1
    return by_name, by_year


def _normalize_by_year(by_name, by_year):
    """Turn the per-name counts into per-year frequencies, in place.

    Every name ends up with an entry for every year seen in the data, using
    0 for years in which the name does not occur.
    """
    for per_year in by_name.values():
        for year, total in by_year.items():
            per_year[year] = per_year.get(year, 0) / float(total)


def _name_statistics(by_name):
    """Return ``(mean frequency per name, relative standard error per name)``."""
    frequency = {}
    variability = {}
    for name, per_year in by_name.items():
        mean, err = mean_err(per_year.values())
        frequency[name] = mean
        # mean > 0: every name here occurs in at least one record
        variability[name] = err / mean
    return frequency, variability


def _split_into_extracts(frequency, totalfrequency, num_extracts):
    """Split the names, most frequent first, into *num_extracts* groups.

    A new group is started once the current one has accumulated more than
    ``totalfrequency / num_extracts``; the last group takes all the rest.
    Returns ``(list of name lists, list of accumulated frequencies)``.
    """
    extracts = [[] for _ in range(num_extracts)]
    extracts_sum = [0 for _ in range(num_extracts)]
    threshold = totalfrequency / float(num_extracts)
    current = 0
    # ties are broken by name so that the output is deterministic
    ranked = sorted(frequency.items(), key=lambda item: (-item[1], item[0]))
    for name, freq in ranked:
        if extracts_sum[current] > threshold and current + 1 < num_extracts:
            current += 1
        extracts[current].append(name)
        extracts_sum[current] += freq
    return extracts, extracts_sum


def _print_extracts(extracts, extracts_sum, frequency, variability, group_size):
    """Print a summary of each extract and its *group_size* least variable names."""
    total_names = sum(len(members) for members in extracts)
    for index, (members, weight) in enumerate(zip(extracts, extracts_sum)):
        # no record selected at all: report 0 instead of dividing by zero
        share = len(members) / float(total_names) if total_names else 0.0
        print("Extract %d:" % index)
        print("names %d, fraction of names %f, fraction of frequency %f"
              % (len(members), share, weight))
        print("Group from extract %d:" % index)
        acumfreq = 0.0
        print("NAME\t\tFREQUENCY\tVARIABILITY")
        steadiest = sorted(members, key=lambda name: variability[name])
        for name in steadiest[:group_size]:
            print("%s%s%f\t%f" % (name, "\t" if len(name) > 7 else "\t\t",
                                  frequency[name], variability[name]))
            acumfreq += frequency[name]
        print("Total frequency in group %f\n" % acumfreq)


def morgue(municipality, num_names, num_extracts=1, requires=None, filter=None):
    """Print the names with the most stable frequency for one municipality.

    Args:
        municipality: suffix of the data file name; ``"obitos_" + municipality``
            is opened relative to the current directory.
        num_names: total number of names to list; each extract lists
            ``num_names // num_extracts`` of them.
        num_extracts: number of groups of roughly equal total frequency the
            names are split into.
        requires: mapping field -> value (or collection of values); a record
            is kept only if every listed field matches.  ``None`` means no
            requirement.
        filter: mapping field -> value (or collection of values); a record
            is dropped if any listed field matches.  ``None`` means no
            exclusion.  (The name shadows the builtin but is kept for
            callers that pass it by keyword.)

    Raises:
        ValueError: if *num_extracts* is smaller than 1.
        IndexError: if a record has fewer fields than the header.
        KeyError: if a field named in *requires* or *filter*, or the
            ``fnomep`` / ``dtnasc`` fields, are missing from the header.
    """
    if num_extracts < 1:
        raise ValueError("num_extracts must be at least 1")
    requires = {} if requires is None else requires
    excludes = {} if filter is None else filter

    dead = _load_records(municipality, requires, excludes)
    by_name, by_year = _count_births(dead)
    _normalize_by_year(by_name, by_year)
    frequency, variability = _name_statistics(by_name)

    totalfrequency = sum(frequency.values())
    print("city %s, totfreq %f\n" % (municipality, totalfrequency))

    extracts, extracts_sum = _split_into_extracts(
        frequency, totalfrequency, num_extracts)
    _print_extracts(extracts, extracts_sum, frequency, variability,
                    num_names // num_extracts)
    print("")


def main():
    """Run the analysis for the configured municipalities."""
    municipalities = set(['410690'])
    num_names = 45
    num_extracts = 3
    requires = dict(sexo='M')  # dict(sexo=['M','F'])
    excludes = {}
    for municipality in municipalities:
        morgue(municipality, num_names, num_extracts, requires, excludes)


if __name__ == "__main__":
    main()