| 1 | %% Author: Caoyuan Deng, Anders Nygreni, Steve Vinoski |
|---|
| 2 | -module(tbray9). |
|---|
| 3 | -compile([native, inline]). |
|---|
| 4 | -export([start/1]). |
|---|
| 5 | |
|---|
| 6 | -define(BUFFER_SIZE, (1024 * 20000)). |
|---|
| 7 | -define(PAT, "/nehW/gniogno/ TEG\" ]"). % lists:reverse("] \"GET /ongoing/When/"). |
|---|
| 8 | -define(PAT_LEN, 21). % length("/nehW/gniogno/ TEG\" ]"). |
|---|
| 9 | -define(SKIP_LEN, 26). % length("] \"GET /ongoing/When/200x/"). |
|---|
| 10 | -define(KEY_OFFSET, 37). % length("] \"GET /ongoing/When/200x/2000/10/10/"). |
|---|
| 11 | |
|---|
| 12 | start([FileName]) -> |
|---|
| 13 | FileSize = filelib:file_size(FileName), |
|---|
| 14 | ProcN = FileSize div ?BUFFER_SIZE, |
|---|
| 15 | ProcN1 = case FileSize rem ?BUFFER_SIZE of 0 -> ProcN; _ -> ProcN + 1 end, |
|---|
| 16 | BmCtx = bm_init(), |
|---|
| 17 | Results = [wait_result(Worker) || Worker <- read_file(FileName, ?BUFFER_SIZE, ProcN1, BmCtx)], |
|---|
| 18 | {Dicts, Segs} = lists:foldl(fun ({Dict, Head, Tail}, {Dicts, Segs}) -> |
|---|
| 19 | {[Dict | Dicts], [Head, Tail | Segs]} |
|---|
| 20 | end, {[], []}, Results), |
|---|
| 21 | Segs1 = [Seg || {_, Seg} <- lists:keysort(1, Segs)], |
|---|
| 22 | Dict = scan_chunk({list_to_binary(Segs1), BmCtx}), |
|---|
| 23 | print_result(merge_dicts([Dict | Dicts])), |
|---|
| 24 | halt(). |
|---|
| 25 | |
|---|
| 26 | read_file(FileName, Size, ProcN, BmCtx) -> |
|---|
| 27 | [spawn_worker(self(), fun scan_file/1, {FileName, Size, I, BmCtx}) |
|---|
| 28 | || I <- lists:seq(0, ProcN - 1)]. |
|---|
| 29 | |
|---|
| 30 | scan_file({FileName, Size, I, BmCtx}) -> |
|---|
| 31 | {ok, File} = file:open(FileName, [raw, binary]), |
|---|
| 32 | {ok, Bin} = file:pread(File, Size * I, Size), |
|---|
| 33 | file:close(File), |
|---|
| 34 | HeadL = split_on_first_newline(Bin, 0, 1), |
|---|
| 35 | TailS = split_on_first_newline(Bin, size(Bin) - 1, -1), |
|---|
| 36 | DataL = TailS - HeadL, |
|---|
| 37 | <<Head:HeadL/binary, Data:DataL/binary, Tail/binary>> = Bin, |
|---|
| 38 | {scan_chunk({Data, BmCtx}), {I * 10, Head}, {I * 10 + 1, Tail}}. |
|---|
| 39 | |
|---|
| 40 | split_on_first_newline(Bin, S, Direction) -> |
|---|
| 41 | case Bin of |
|---|
| 42 | <<_:S/binary,$\n,_/binary>> -> S + 1; |
|---|
| 43 | <<_:S/binary,_,_/binary>> -> split_on_first_newline(Bin, S + Direction, Direction); |
|---|
| 44 | _ -> S |
|---|
| 45 | end. |
|---|
| 46 | |
|---|
| 47 | scan_chunk({Bin, BmCtx}) -> scan_chunk_1(Bin, size(Bin), 0, dict:new(), BmCtx). |
|---|
| 48 | scan_chunk_1(Bin, DataL, S, Dict, BmCtx) when S < DataL - ?KEY_OFFSET -> |
|---|
| 49 | case bm_match(Bin, S, BmCtx) of |
|---|
| 50 | {true, _} -> |
|---|
| 51 | case match_until_space_newline(Bin, S + ?KEY_OFFSET, size(Bin)) of |
|---|
| 52 | {true, E} -> |
|---|
| 53 | Skip = S + ?SKIP_LEN, L = E - Skip, |
|---|
| 54 | <<_:Skip/binary, Key:L/binary, _/binary>> = Bin, |
|---|
| 55 | scan_chunk_1(Bin, DataL, E + 1, dict:update_counter(Key, 1, Dict), BmCtx); |
|---|
| 56 | {false, E} -> |
|---|
| 57 | scan_chunk_1(Bin, DataL, E + 1, Dict, BmCtx) |
|---|
| 58 | end; |
|---|
| 59 | {false, Shift} -> |
|---|
| 60 | scan_chunk_1(Bin, DataL, S + Shift, Dict, BmCtx) |
|---|
| 61 | end; |
|---|
| 62 | scan_chunk_1(_, _, _, Dict, _) -> Dict. |
|---|
| 63 | |
|---|
| 64 | match_until_space_newline(Bin, S, DataL) when S < DataL -> |
|---|
| 65 | case Bin of |
|---|
| 66 | <<_:S/binary,10,_/binary>> -> {false, S}; |
|---|
| 67 | <<_:S/binary,$.,_/binary>> -> {false, S}; |
|---|
| 68 | <<_:S/binary,_,$ ,_/binary>> -> {true, S + 1}; |
|---|
| 69 | _ -> match_until_space_newline(Bin, S + 1, DataL) |
|---|
| 70 | end; |
|---|
| 71 | match_until_space_newline(_, S, _) -> {false, S}. |
|---|
| 72 | |
|---|
| 73 | spawn_worker(Parent, F, A) -> |
|---|
| 74 | erlang:spawn_monitor(fun() -> Parent ! {self(), F(A)} end). |
|---|
| 75 | |
|---|
| 76 | wait_result({Pid, Ref}) -> |
|---|
| 77 | receive |
|---|
| 78 | {'DOWN', Ref, _, _, normal} -> receive {Pid, Result} -> Result end; |
|---|
| 79 | {'DOWN', Ref, _, _, Reason} -> exit(Reason) |
|---|
| 80 | end. |
|---|
| 81 | |
|---|
| 82 | merge_dicts([D1,D2|Rest]) -> |
|---|
| 83 | merge_dicts([dict:merge(fun(_, V1, V2) -> V1 + V2 end, D1, D2) | Rest]); |
|---|
| 84 | merge_dicts([D]) -> D. |
|---|
| 85 | |
|---|
| 86 | print_result(Dict) -> |
|---|
| 87 | SortedList = lists:reverse(lists:keysort(2, dict:to_list(Dict))), |
|---|
| 88 | [io:format("~b\t: ~s~n", [V, K]) || {K, V} <- lists:sublist(SortedList, 10)]. |
|---|
| 89 | |
|---|
| 90 | bm_init() -> |
|---|
| 91 | Default = dict:from_list([{C, ?PAT_LEN} || C <- lists:seq(1, 255)]), |
|---|
| 92 | Dict = bm_set_shifts(lists:reverse(?PAT), ?PAT_LEN, 1, Default), |
|---|
| 93 | list_to_tuple([Pos || {_, Pos} <- lists:sort(dict:to_list(Dict))]). |
|---|
| 94 | |
|---|
| 95 | bm_set_shifts([C|T], StrLen, Pos, Dict) -> |
|---|
| 96 | bm_set_shifts(T, StrLen, Pos + 1, dict:store(C, StrLen - Pos, Dict)); |
|---|
| 97 | bm_set_shifts([], _, _, Dict) -> Dict. |
|---|
| 98 | |
|---|
| 99 | bm_match(Bin, S, Tab) -> bm_match_1(Bin, S + ?PAT_LEN - 1, ?PAT, Tab, 0). |
|---|
| 100 | bm_match_1(Bin, S, [C|T], Tab, Count) -> |
|---|
| 101 | <<_:S/binary, C1, _/binary>> = Bin, |
|---|
| 102 | case C1 of |
|---|
| 103 | C -> |
|---|
| 104 | bm_match_1(Bin, S - 1, T, Tab, Count + 1); |
|---|
| 105 | _ -> |
|---|
| 106 | case element(C1, Tab) of |
|---|
| 107 | ?PAT_LEN -> {false, ?PAT_LEN}; |
|---|
| 108 | Shift when Shift =< Count -> {false, 1}; |
|---|
| 109 | Shift -> {false, Shift - Count} |
|---|
| 110 | end |
|---|
| 111 | end; |
|---|
| 112 | bm_match_1(_, _, [], _, _) -> {true, ?PAT_LEN}. |
|---|