Due October 7

Using mapreduce, build a database of the words in the h1 tags of web pages. You will be given a list of urls. The urls will be in a file with one url per line. Your program will fetch the web pages and extract the words inside the <h1> tags. For this assignment words are separated by a space. You do not have to worry about punctuation. Once you have the words you will build a table and/or database that uses the words as keys. The value(s) associated with a key contain all the urls of the pages that contain the word in an h1 tag. Assume that we have two pages http://foo and http://bar with the following contents.

http:foo

http:bar

Then our table/database looks something like:

cat	[http:foo, http:bar]
mat	[http:bar]
rat	[http:foo]

That is, at the key "cat" we have two entries, one for each of the pages the word is found in.

Test Pages

Here is a file containing urls to pages with h1 tags. In case you can't access the file, here are its contents:

http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample1.html

http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample2.html

http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample3.html

http://www.eli.sdsu.edu/courses/fall08/cs683/assignments/assignment2Data/sample4.html

http://bismarck.sdsu.edu/~whitney/assignment2Data/sample5.html

http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample6.html

http://www.eli.sdsu.edu/courses/fall08/cs683/assignments/assignment2Data/sample7.html

http://bismarck.sdsu.edu/~whitney/assignment2Data/sample8.html

http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample9.html

http://www.eli.sdsu.edu/courses/fall08/cs683/assignments/assignment2Data/sample10.html

http://bismarck.sdsu.edu/~whitney/assignment2Data/sample11.html

http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample12.html

Write a supervisor to keep the processes in part 1 alive and make an Erlang application for all the code so you can start/stop your program with application:start/application:stop functions.

For those who could not finish assignment one, the code for assignment 1 problem 3 is below. While it may not be the best way to solve the problems, it does work. Unit tests are given at the end to show how the code is used.

Assignment 1 Code

File: matching.erl

-module(matching).

-compile(export_all).

%% @doc Split Target around the first occurrence of Pattern.
%%
%% Returns a two-element list [Before, After]: Before is the text that
%% precedes the first match and After is the text that follows it; the
%% matched Pattern itself is dropped. When Pattern never occurs the result
%% is [Target, ""]. An empty Pattern matches at the very start of a
%% non-empty Target, giving ["", Target].
split(Pattern, Target) ->
    split(Pattern, Target, []).

%% Walks Target one character at a time, testing for Pattern at each
%% position. Seen holds the characters already passed over, most recent
%% first, so it is reversed before being returned.
split(_Pattern, [], Seen) ->
    [lists:reverse(Seen), ""];
split(Pattern, [Char | Rest] = Target, Seen) ->
    case lists:prefix(Pattern, Target) of
        false ->
            split(Pattern, Rest, [Char | Seen]);
        true ->
            After = lists:nthtail(length(Pattern), Target),
            [lists:reverse(Seen), After]
    end.

File: html_parsing.erl

-module(html_parsing).

-compile(export_all).

%% @doc Return the contents of every Tag element found in HtmlString.
%%
%% Tag is a bare tag name such as "h1". Surrounding spaces are stripped and
%% matching is case-insensitive: both the tag and the whole document are
%% lower-cased before scanning, so the returned contents are lower-case too.
%% Only the exact opening tag "<tag>" is recognised (an opening tag carrying
%% attributes will not match). Elements with empty contents are omitted.
%% Results are returned in document order. An empty Tag or an empty
%% HtmlString yields [].
tag_contents("", _) ->
    [];
tag_contents(_Tag, "") ->
    [];
tag_contents(Tag, HtmlString) ->
    %% Normalise the tag name once and derive both delimiters from it.
    Name = string:to_lower(string:strip(Tag)),
    StartTag = "<" ++ Name ++ ">",
    EndTag = "</" ++ Name ++ ">",
    tag_contents(StartTag, EndTag, string:to_lower(HtmlString), []).

%% Worker: repeatedly cuts the text after the next StartTag and up to the
%% following EndTag out of HtmlString. AllContents accumulates the pieces in
%% reverse order. When StartTag is no longer found, matching:split/2 returns
%% an empty remainder, which ends the recursion.
tag_contents(_StartTag, _EndTag, "", AllContents) ->
    lists:reverse(AllContents);
tag_contents(StartTag, EndTag, HtmlString, AllContents) ->
    [_PreTag, TagContentsPlus] = matching:split(StartTag, HtmlString),
    [TagContents, Remainder] = matching:split(EndTag, TagContentsPlus),
    case TagContents of
        "" ->
            %% Nothing between the tags (or no further tag): record nothing.
            tag_contents(StartTag, EndTag, Remainder, AllContents);
        _ ->
            tag_contents(StartTag, EndTag, Remainder, [TagContents | AllContents])
    end.

File: assignment1Test.erl

-include_lib("eunit/include/eunit.hrl").

-module(assignment1Test).

-import(matching).

%% EUnit test generator for matching:split/2. Covers a match in the middle,
%% at the start and at the end of the target, a pattern that never occurs,
%% an empty pattern, an empty target, and case sensitivity of the match.
pattern_split_test_() ->
    [
        ?_assertEqual(["ac", "abc"], matching:split("ab", "acababc")),
        ?_assertEqual(["", "ababc"], matching:split("ac", "acababc")),
        ?_assertEqual(["acaba", ""], matching:split("bc", "acababc")),
        ?_assertEqual(["acababc", ""], matching:split("aa", "acababc")),
        ?_assertEqual(["", "acababc"], matching:split("", "acababc")),
        ?_assertEqual(["", ""], matching:split("aa", "")),
        ?_assertEqual(["1ab", "c"], matching:split("A", "1abAc"))
    ].

tag_contents_test_() ->

[

?_assert([] == html_parsing:tag_contents("h1","")),

?_assert([] == html_parsing:tag_contents("h1","cat rat")),

?_assert(["cat"] == html_parsing:tag_contents("h1","<h1>cat</h1>")),

?_assert(["cat"] == html_parsing:tag_contents(" H1 ","<h1>cat</h1>")),

?_assert(["a", "c"] == html_parsing:tag_contents(" h1 ","<H1>a</h1>b<h1>c</h1>d")),

?_assert(["d", "g"] == html_parsing:tag_contents("h2","<h1>a</h1>b<p>c<h2>d</h2>e</p>f<h2>g</h2>"))