<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<!-- Channel metadata for the "ETL" tag feed of benhealey.info. -->

	<!-- Identity: feed title, the feed's own address (rel="self"), site link and tagline. -->
	<title>Ben Healey &#187; ETL</title>
	<atom:link
		rel="self"
		type="application/rss+xml"
		href="http://benhealey.info/tag/etl/feed/"/>
	<link>http://benhealey.info</link>
	<description>Data Aficionado  &#124;  Wellington, New Zealand</description>

	<!-- Build time of this snapshot and the language of the feed. -->
	<lastBuildDate>Sat, 14 Jan 2012 20:18:17 +0000</lastBuildDate>
	<language>en</language>

	<!-- Syndication-module hints: update period "hourly" with a frequency of 1. -->
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>

	<generator>http://wordpress.com/</generator>

	<!-- Cloud (update notification) endpoint; registerProcedure is intentionally left empty here. -->
	<cloud
		domain="benhealey.info"
		port="80"
		path="/?rsscloud=notify"
		registerProcedure=""
		protocol="http-post"/>

	<!-- Channel image; its title and link repeat the channel's own title and link. -->
	<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>Ben Healey &#187; ETL</title>
		<link>http://benhealey.info</link>
	</image>

	<!-- Auxiliary links: an OpenSearch description document and a rel="hub" endpoint. -->
	<atom:link
		rel="search"
		type="application/opensearchdescription+xml"
		href="http://benhealey.info/osd.xml"
		title="Ben Healey"/>
	<atom:link
		rel="hub"
		href="http://benhealey.info/?pushpress=hub"/>
		<!-- Feed entry for the post "Music to a Data Geek's Ears" (pubDate 04 Oct 2009). -->
		<item>
		<title>Music to a Data Geek&#8217;s Ears</title>
		<link>http://benhealey.info/2009/10/04/music-to-a-data-geeks-ears/</link>
		<comments>http://benhealey.info/2009/10/04/music-to-a-data-geeks-ears/#comments</comments>
		<pubDate>Sun, 04 Oct 2009 01:23:24 +0000</pubDate>
		<dc:creator>Ben</dc:creator>
		<!-- Category labels, each wrapped in CDATA. The footer of content:encoded below
		     lists "Thoughts" as the category the post is filed in and the rest as tags. -->
				<category><![CDATA[Thoughts]]></category>
		<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Business Intelligence]]></category>
		<category><![CDATA[Data Transformation]]></category>
		<category><![CDATA[ETL]]></category>
		<category><![CDATA[Metrics]]></category>
		<category><![CDATA[Split Testing]]></category>

		<!-- isPermaLink="false": the guid is an identifier only; the canonical URL is <link> above. -->
		<guid isPermaLink="false">http://benhealey.info/?p=170</guid>
		<!-- Truncated excerpt (ends in "[...]") followed by a 1x1 image served from
		     stats.wordpress.com (presumably a view-tracking pixel; not confirmed from this file). -->
		<description><![CDATA[&#8220;If you are looking for a career where your services will be in high demand, you should find something where you provide a scarce, complementary service to something that is getting ubiquitous and cheap. So what&#8217;s getting ubiquitous and cheap? Data. And what is complementary to data? Analysis. So my recommendation is to take lots [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benhealey.info&amp;blog=5583171&amp;post=170&amp;subd=benhealey&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
		<!-- Full post body as CDATA-wrapped HTML spanning several physical lines. Inside CDATA
		     the XML parser expands nothing: references such as &#8217; and &amp; are passed
		     through as literal HTML text, and all whitespace and line breaks are content. -->
			<content:encoded><![CDATA[<blockquote><p>&#8220;If you are looking for a career where your services will be in high demand, you should find something where you provide a scarce, complementary service to something that is getting ubiquitous and cheap. So what&#8217;s getting ubiquitous and cheap? Data. And what is complementary to data? Analysis. So my recommendation is to take lots of courses about how to manipulate and analyze data: databases, machine learning, econometrics, statistics, visualization, and so on.&#8221;  <a href="http://freakonomics.blogs.nytimes.com/2008/02/25/hal-varian-answers-your-questions/">Hal Varian, Chief Economist at Google</a></p></blockquote>
<p>Me suffer from <a href="http://en.wikipedia.org/wiki/Confirmation_bias">confirmation bias</a>? Never!<br />
_____</p>
<p>Short URL for this post: <a href="http://wp.me/pnqr9-2K">http://wp.me/pnqr9-2K</a></p><br />Posted in Thoughts Tagged: Analytics, Business Intelligence, Data Transformation, ETL, Metrics, Split Testing <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/benhealey.wordpress.com/170/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/benhealey.wordpress.com/170/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/benhealey.wordpress.com/170/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/benhealey.wordpress.com/170/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/benhealey.wordpress.com/170/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/benhealey.wordpress.com/170/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/benhealey.wordpress.com/170/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/benhealey.wordpress.com/170/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/benhealey.wordpress.com/170/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/benhealey.wordpress.com/170/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/benhealey.wordpress.com/170/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/benhealey.wordpress.com/170/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/benhealey.wordpress.com/170/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/benhealey.wordpress.com/170/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benhealey.info&amp;blog=5583171&amp;post=170&amp;subd=benhealey&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
		<!-- Address of the per-post comment feed, and the comment count (0) at build time. -->
			<wfw:commentRss>http://benhealey.info/2009/10/04/music-to-a-data-geeks-ears/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<!-- Gravatar image attached to the entry. This URL is outside CDATA, so &#38; is a
		     character reference that the parser resolves to a literal ampersand. -->
		<media:content url="http://1.gravatar.com/avatar/7242c6f38f9056b8d9a96695535fe428?s=96&#38;d=identicon&#38;r=PG" medium="image">
			<media:title type="html">Ben</media:title>
		</media:content>
	</item>
		<!-- Feed entry for the post "A Nifty Trick for Transforming Categorical Data"
		     (pubDate 20 Sep 2009). -->
		<item>
		<title>A Nifty Trick for Transforming Categorical Data</title>
		<link>http://benhealey.info/2009/09/20/a-nifty-trick-for-transforming-categorical-data/</link>
		<comments>http://benhealey.info/2009/09/20/a-nifty-trick-for-transforming-categorical-data/#comments</comments>
		<pubDate>Sun, 20 Sep 2009 02:50:56 +0000</pubDate>
		<dc:creator>Ben</dc:creator>
		<!-- Category labels, each wrapped in CDATA. The footer of content:encoded below
		     lists "Thoughts" as the category the post is filed in and the rest as tags. -->
				<category><![CDATA[Thoughts]]></category>
		<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Business Intelligence]]></category>
		<category><![CDATA[Data Transformation]]></category>
		<category><![CDATA[ETL]]></category>
		<category><![CDATA[Metrics]]></category>

		<!-- isPermaLink="false": the guid is an identifier only; the canonical URL is <link> above. -->
		<guid isPermaLink="false">http://benhealey.info/?p=115</guid>
		<!-- Truncated excerpt (ends in "[...]") followed by a 1x1 image served from
		     stats.wordpress.com (presumably a view-tracking pixel; not confirmed from this file). -->
		<description><![CDATA[Categorical variables with lots of options (e.g., country of origin, occupation, postcodes) can be problematic when regression modelling; they have to be dummy coded and use many degrees of freedom, increasing the potential for model overfitting.  The typical approaches to dealing with this are to: Discard the variable if it doesn&#8217;t appear it will be [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benhealey.info&amp;blog=5583171&amp;post=115&amp;subd=benhealey&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
		<!-- Full post body as CDATA-wrapped HTML spanning several physical lines. Inside CDATA
		     the XML parser expands nothing: references such as &#8217; and &amp; are passed
		     through as literal HTML text, and all whitespace and line breaks are content. -->
			<content:encoded><![CDATA[<p>Categorical variables with lots of options (e.g., country of origin, occupation, postcodes) can be problematic when regression modelling; they have to be dummy coded and use many degrees of freedom, increasing the potential for model overfitting.  The typical approaches to dealing with this are to:</p>
<ul>
<li>Discard the variable if it doesn&#8217;t appear it will be a good discriminator. It is sometimes hard to tell this up front when you have loads of categories.</li>
<li>Roll the categories up into larger sets based on conceptual similarity.  This can work for ordinal or geographic data, but is more difficult for purely nominal variables.  There is also the risk that you&#8217;ll &#8216;average away&#8217; some of the predictive value in the variable.</li>
<li>Use a statistical technique (e.g., a decision tree) to work out groupings of categories based on their discriminative power.  This may make for groupings that are hard to explain.</li>
</ul>
<p>Another option I&#8217;ve recently come across is to convert the categorical variable to a metric-level variable using historic response data.  For instance, say you&#8217;ve been collecting your customer&#8217;s postcodes for a while and are looking to employ this variable in a predictive model.  Perhaps you are predicting response to a mailing offer (or something similar) which has been running for at least one learning cycle.  A potential way to deal with the &#8216;too many categories&#8217; problem would be to calculate the proportion of people contacted in each postcode during prior mailings who responded to the offer.  Voilà!  You&#8217;ve now got a metric level and continuous variable to play with.  You can apply the historic response values to any new prospects you are looking to score by matching on the postcode.</p>
<p>There are at least a couple of caveats to consider when attempting this.  One is that the proportion will be less robust when you have very few people in a specific category historically (e.g., rural postcodes).  In these cases you might have to do some category roll-ups first.  Another potential issue is that it assumes historic contacts were made at random, or according to some mechanism that will also be applied in future selection processes, such that you can consider the prior contacts &#8216;representative&#8217; of category membership for the purposes of your modelling.  Violations of the assumption would probably require some statistical adjustment to get around.</p>
<p>If anyone sees other potential issues with this approach, or has other alternatives they use to deal with problematic categorical variables, feel free to comment!</p>
<p>_____</p>
<p>Short URL for this post: <a href="http://wp.me/pnqr9-1R">http://wp.me/pnqr9-1R</a></p><br />Posted in Thoughts Tagged: Analytics, Business Intelligence, Data Transformation, ETL, Metrics <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/benhealey.wordpress.com/115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/benhealey.wordpress.com/115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/benhealey.wordpress.com/115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/benhealey.wordpress.com/115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/benhealey.wordpress.com/115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/benhealey.wordpress.com/115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/benhealey.wordpress.com/115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/benhealey.wordpress.com/115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/benhealey.wordpress.com/115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/benhealey.wordpress.com/115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/benhealey.wordpress.com/115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/benhealey.wordpress.com/115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/benhealey.wordpress.com/115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/benhealey.wordpress.com/115/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benhealey.info&amp;blog=5583171&amp;post=115&amp;subd=benhealey&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
		<!-- Address of the per-post comment feed, and the comment count (0) at build time. -->
			<wfw:commentRss>http://benhealey.info/2009/09/20/a-nifty-trick-for-transforming-categorical-data/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<!-- Gravatar image attached to the entry. This URL is outside CDATA, so &#38; is a
		     character reference that the parser resolves to a literal ampersand. -->
		<media:content url="http://1.gravatar.com/avatar/7242c6f38f9056b8d9a96695535fe428?s=96&#38;d=identicon&#38;r=PG" medium="image">
			<media:title type="html">Ben</media:title>
		</media:content>
	</item>
	</channel>
</rss>
