# Sample whitespace-delimited records: 16 rows of 15 fields each.
# Fields 1 and 2 (0-based) hold the start/end values that the
# de-duplication step keys on; several rows intentionally share them.
data_string = """
chr1 35276 35481 NR_026820_exon_1_0_chr1_35277_r 0 - 0.526829 0.473171 54 37 60 54 0 0 205
chr1 35720 36081 NR_026818_exon_2_0_chr1_35721_r 0 - 0.398892 0.601108 73 116 101 71 0 0 361
chr1 35720 36081 NR_026820_exon_2_0_chr1_35721_r 0 - 0.398892 0.601108 73 116 101 71 0 0 361
chr1 69090 70008 NM_001005484_exon_0_0_chr1_69091_f 0 + 0.571895 0.428105 212 218 175 313 0 0 918
chr1 134772 139696 NR_039983_exon_0_0_chr1_134773_r 0 - 0.366775 0.633225 997 1194 1924 809 0 0 4924
chr1 139789 139847 NR_039983_exon_1_0_chr1_139790_r 0 - 0.551724 0.448276 13 12 14 19 0 0 58
chr1 140074 140566 NR_039983_exon_2_0_chr1_140075_r 0 - 0.475610 0.524390 126 144 114 108 0 0 492
chr1 323891 324060 NR_028322_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 323891 324060 NR_028325_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 323891 324060 NR_028327_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 324287 324345 NR_028322_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324287 324345 NR_028325_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324287 324345 NR_028327_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324438 326938 NR_028327_exon_2_0_chr1_324439_f 0 + 0.375200 0.624800 400 1013 549 538 0 0 2500
chr1 324438 328581 NR_028322_exon_2_0_chr1_324439_f 0 + 0.378228 0.621772 678 1580 996 889 0 0 4143
chr1 324438 328581 NR_028325_exon_2_0_chr1_324439_f 0 + 0.378228 0.621772 678 1580 996 889 0 0 4143
"""
# This looks suspiciously CSV-like, so let pandas do the parsing.
import pandas

try:
    import StringIO  # Python 2: the standalone StringIO module
except ImportError:
    # Python 3 removed that module; io exposes a compatible StringIO class,
    # so aliasing keeps the `StringIO.StringIO(...)` spelling working.
    import io as StringIO

# Wrap the text in a file-like buffer. If you have the file path, you can
# pass that to read_csv instead of the buffer.
buf = StringIO.StringIO(data_string)
# header=None creates a DataFrame whose columns are labelled 0, 1, 2, ...
# sep=r"\s+" splits on runs of whitespace; it is the documented replacement
# for the deprecated delim_whitespace=True.
df = pandas.read_csv(buf, sep=r"\s+", header=None)
>>> print df
0 1 2 3 4 5 6 \
0 chr1 35276 35481 NR_026820_exon_1_0_chr1_35277_r 0 - 0.526829
1 chr1 35720 36081 NR_026818_exon_2_0_chr1_35721_r 0 - 0.398892
2 chr1 35720 36081 NR_026820_exon_2_0_chr1_35721_r 0 - 0.398892
3 chr1 69090 70008 NM_001005484_exon_0_0_chr1_69091_f 0 + 0.571895
4 chr1 134772 139696 NR_039983_exon_0_0_chr1_134773_r 0 - 0.366775
5 chr1 139789 139847 NR_039983_exon_1_0_chr1_139790_r 0 - 0.551724
6 chr1 140074 140566 NR_039983_exon_2_0_chr1_140075_r 0 - 0.475610
7 chr1 323891 324060 NR_028322_exon_0_0_chr1_323892_f 0 + 0.426035
8 chr1 323891 324060 NR_028325_exon_0_0_chr1_323892_f 0 + 0.426035
9 chr1 323891 324060 NR_028327_exon_0_0_chr1_323892_f 0 + 0.426035
10 chr1 324287 324345 NR_028322_exon_1_0_chr1_324288_f 0 + 0.551724
11 chr1 324287 324345 NR_028325_exon_1_0_chr1_324288_f 0 + 0.551724
12 chr1 324287 324345 NR_028327_exon_1_0_chr1_324288_f 0 + 0.551724
13 chr1 324438 326938 NR_028327_exon_2_0_chr1_324439_f 0 + 0.375200
14 chr1 324438 328581 NR_028322_exon_2_0_chr1_324439_f 0 + 0.378228
15 chr1 324438 328581 NR_028325_exon_2_0_chr1_324439_f 0 + 0.378228
# ... more data skipped...
# drop duplicates for non-unique sets of values in columns 1, 2 (start, end)
no_dups = df.drop_duplicates([1, 2])
>>> print no_dups
0 1 2 3 4 5 6 \
0 chr1 35276 35481 NR_026820_exon_1_0_chr1_35277_r 0 - 0.526829
1 chr1 35720 36081 NR_026818_exon_2_0_chr1_35721_r 0 - 0.398892
3 chr1 69090 70008 NM_001005484_exon_0_0_chr1_69091_f 0 + 0.571895
4 chr1 134772 139696 NR_039983_exon_0_0_chr1_134773_r 0 - 0.366775
5 chr1 139789 139847 NR_039983_exon_1_0_chr1_139790_r 0 - 0.551724
6 chr1 140074 140566 NR_039983_exon_2_0_chr1_140075_r 0 - 0.475610
7 chr1 323891 324060 NR_028322_exon_0_0_chr1_323892_f 0 + 0.426035
10 chr1 324287 324345 NR_028322_exon_1_0_chr1_324288_f 0 + 0.551724
13 chr1 324438 326938 NR_028327_exon_2_0_chr1_324439_f 0 + 0.375200
14 chr1 324438 328581 NR_028322_exon_2_0_chr1_324439_f 0 + 0.378228
Создайте множество `seen_start_end`. Для каждой строки создайте кортеж `start_end` из столбцов 2 и 3. Если `start_end` уже есть в `seen_start_end`, выполните `continue` и перейдите к следующей строке. В противном случае сохраните строку и добавьте `start_end` в `seen_start_end`. - person Steven Rumbalski   schedule 24.06.2015