Boolean Biotech

iGEM posts each team's abstract every year (for example, see the 2013 team abstracts). These abstracts make fascinating reading, so I wanted to compile them in one place.

I first tried to parse the text into a spreadsheet using Kimono. I thought this would be pretty easy, but unfortunately Kimono could not cope with the page's inconsistent formatting. Instead, I ended up using Scrapy (code included at the end of this blog post). Scrapy is a pretty heavyweight program, but it worked.

iGEM Projects spreadsheet or HTML version of the spreadsheet

# -*- coding: utf-8 -*-
# Run with: scrapy crawl test1 -o test1.json
import scrapy

class IgemItem(scrapy.Item):
    """One scraped team abstract from an iGEM "Team Abstracts" page.

    Every field is filled in by ``Test1Spider.parse`` as a plain string.
    """

    # Year of the competition, taken from the response URL.
    year = scrapy.Field()
    # Text of the link inside the team's headline element.
    title = scrapy.Field()
    # Secondary headline text; where it comes from depends on the page layout.
    subtitle = scrapy.Field()
    # Text of a paragraph following the headline (the abstract itself).
    description = scrapy.Field()

class Test1Spider(scrapy.Spider):
    """Scrape team abstracts from the iGEM Jamboree "Team Abstracts" pages.

    One ``IgemItem`` is yielded per headline element found on each start
    page. The pages do not share a single layout, so ``parse`` chooses the
    XPath expressions to use based on which year appears in the URL.
    """

    name = "test1"
    #allowed_domains = ["igem.org"]
    start_urls = (
        'http://2008.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2009.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2010.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2011.igem.org/Jamboree/Team_Abstracts',
        'http://2012.igem.org/Jamboree/Team_Abstracts',
        'http://2013.igem.org/Jamboree/Team_Abstracts',
    )

    # Headline selectors: most years wrap the headline span in an <h4>;
    # the 2009 and 2010 pages are matched on the span alone.
    _H4_HEADLINE_XPATH = '//h4/span[contains(@class, "mw-headline")]'
    _ANY_HEADLINE_XPATH = '//span[contains(@class, "mw-headline")]'

    @staticmethod
    def _clean(sel):
        """Join the extracted text of *sel* into one tidy string.

        Newlines are removed, then leading/trailing colons and spaces are
        stripped, then any remaining surrounding whitespace. An empty
        selection yields ``""``.
        """
        return ''.join(sel.extract()).replace("\n", "").strip(": ").strip()

    @staticmethod
    def _year_from_url(url):
        """Return the first four characters of the host part of *url*.

        The scheme is split off rather than skipped by a fixed offset, so
        both ``http://`` and ``https://`` URLs (e.g. after a redirect)
        produce the same result.
        """
        return url.split("://", 1)[-1][:4]

    def parse(self, response):
        """Yield one ``IgemItem`` per team headline on the page.

        Args:
            response: the Scrapy response for one of ``start_urls``.

        Yields:
            IgemItem: with ``year``, ``title``, ``subtitle`` and
            ``description`` set to cleaned strings (possibly empty when the
            corresponding node is missing).
        """
        url = response.url
        year = self._year_from_url(url)

        if "2008" in url:
            # 2008 layout: subtitle is the italic text of the first
            # following paragraph, description is the second paragraph.
            headline_xpath = self._H4_HEADLINE_XPATH
            subtitle_xpath = '../following-sibling::p[1]/i/text()'
            description_xpath = '../following-sibling::p[2]/text()'
        else:
            if any(yr in url for yr in ("2009", "2010")):
                headline_xpath = self._ANY_HEADLINE_XPATH
            else:
                headline_xpath = self._H4_HEADLINE_XPATH
            # Later layouts: subtitle is the headline span's own text and
            # the description is the first following paragraph.
            subtitle_xpath = "text()"
            description_xpath = '../following-sibling::p[1]/text()'

        for sel in response.xpath(headline_xpath):
            item = IgemItem()
            item['year'] = year
            item['title'] = self._clean(sel.xpath("a/text()"))
            item['subtitle'] = self._clean(sel.xpath(subtitle_xpath))
            item['description'] = self._clean(sel.xpath(description_xpath))
            yield item

Comment