iGEM posts each team's abstract every year (for example, see the 2013 team abstracts). These abstracts are fascinating reading, so I wanted to compile them in one place.
I first tried to parse the text into a spreadsheet using Kimono. I thought this would be pretty easy, but unfortunately Kimono could not cope with the inconsistent formatting of the pages. Instead, I ended up using Scrapy (code included at the end of this blog post). Scrapy is a pretty heavyweight program, but it worked.
iGEM Projects spreadsheet or HTML version of the spreadsheet
# -*- coding: utf-8 -*-
# Scrapy spider that collects the iGEM team abstracts from the yearly
# "Team_Abstracts" Jamboree pages and emits one item per team heading.
#
# Run with: scrapy crawl test1 -o test1.json
import scrapy


# XPaths selecting the heading spans (class "mw-headline") that introduce
# each team's entry.  Some years nest the span in an <h4>, others do not.
_ANY_HEADLINE = '//span[contains(@class, "mw-headline")]'
_H4_HEADLINE = '//h4/span[contains(@class, "mw-headline")]'

# Page layout per year: (heading XPath, subtitle XPath, description XPath).
# The subtitle and description XPaths are evaluated relative to each heading.
_LAYOUTS = {
    "2008": (
        _H4_HEADLINE,
        '../following-sibling::p[1]/i/text()',
        '../following-sibling::p[2]/text()',
    ),
    "2009": (_ANY_HEADLINE, "text()", '../following-sibling::p[1]/text()'),
    "2010": (_ANY_HEADLINE, "text()", '../following-sibling::p[1]/text()'),
}

# Layout used for every year that has no dedicated entry above.
_DEFAULT_LAYOUT = (_H4_HEADLINE, "text()", '../following-sibling::p[1]/text()')


def _year_from_url(url):
    """Return the four characters that follow the URL scheme.

    For the start URLs of this spider (``http://2013.igem.org/...``) that is
    the competition year.  Splitting on ``://`` instead of slicing at a fixed
    offset keeps the result correct for ``https://`` URLs as well.
    """
    return url.split("://", 1)[-1][:4]


def _clean(sel):
    """Join the strings extracted from *sel* into one trimmed string.

    Newlines are removed, then leading/trailing colons and spaces, then any
    remaining surrounding whitespace.  A selection that matched nothing
    extracts to an empty list, which joins to ``""`` without a special case.
    """
    text = ''.join(sel.extract()).replace("\n", "")
    return text.strip(": ").strip()


class IgemItem(scrapy.Item):
    """One team entry scraped from a team-abstracts page."""

    year = scrapy.Field()         # four-character year taken from the page URL
    title = scrapy.Field()        # text of the link inside the heading span
    subtitle = scrapy.Field()     # heading text (2008: italic text of the next <p>)
    description = scrapy.Field()  # text of the paragraph following the heading


class Test1Spider(scrapy.Spider):
    """Crawl the 2008-2013 team-abstract pages and yield ``IgemItem`` objects."""

    name = "test1"
    # allowed_domains = ["igem.org"]
    start_urls = (
        'http://2008.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2009.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2010.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2011.igem.org/Jamboree/Team_Abstracts',
        'http://2012.igem.org/Jamboree/Team_Abstracts',
        'http://2013.igem.org/Jamboree/Team_Abstracts',
    )

    def parse(self, response):
        """Yield one ``IgemItem`` for every team heading found on the page.

        The set of XPaths to use is chosen from the year embedded in
        ``response.url``, because the pages are not formatted consistently
        from year to year.
        """
        year = _year_from_url(response.url)
        heading_xp, subtitle_xp, description_xp = _LAYOUTS.get(
            year, _DEFAULT_LAYOUT
        )

        for heading in response.xpath(heading_xp):
            item = IgemItem()
            item['year'] = year
            item['title'] = _clean(heading.xpath("a/text()"))
            item['subtitle'] = _clean(heading.xpath(subtitle_xp))
            item['description'] = _clean(heading.xpath(description_xp))
            yield item