-
Notifications
You must be signed in to change notification settings - Fork 707
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding second moment of values per key for Typed-API reduce operations #1279
base: develop
Are you sure you want to change the base?
Changes from 8 commits
05df02b
d4aff6b
c8eebf8
b48da0b
0f495b1
58c3f12
4e3db4f
3f5cb5e
2b767c0
1a8d6e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -476,7 +476,13 @@ package com.twitter.scalding { | |
} | ||
} | ||
|
||
/** In the typed API every reduce operation is handled by this Buffer */ | ||
private[scalding] object SkewMonitorCounters { | ||
// Strangely, if group name and key name are different, then the counter would be zero | ||
val KeyCount = "scalding.skew.monitor.key.count" | ||
val ValuesCountSum = "scalding.skew.monitor.value.count.sum" | ||
val ValuesCountSquareSum = "scalding.skew.monitor.value.sum.square" | ||
} | ||
|
||
class TypedBufferOp[K, V, U]( | ||
conv: TupleConverter[K], | ||
@transient reduceFn: (K, Iterator[V]) => Iterator[U], | ||
|
@@ -491,13 +497,23 @@ package com.twitter.scalding { | |
.asScala | ||
.map(_.getObject(0).asInstanceOf[V]) | ||
|
||
val caches = values.toList | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so, we can't do this (because it would force everything to memory), but what about something like this: class CountingIterator[T](wraps: Iterator[T]) extends Iterator[T] {
private[this] var nextCalls = 0
def hasNext = wraps.hasNext
def next = { nextCalls += 1; wraps.next }
def seen: Int = nextCalls
} then we could wrap There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good point |
||
val numValuesPerKey = caches.size.toLong | ||
|
||
// Avoiding a lambda here | ||
val resIter = reduceFnSer.get(key, values) | ||
val resIter = reduceFnSer.get(key, caches.toIterator) | ||
while (resIter.hasNext) { | ||
val tup = Tuple.size(1) | ||
tup.set(0, resIter.next) | ||
val t2 = resIter.next | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we remove this change? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
|
||
tup.set(0, t2) | ||
oc.add(tup) | ||
} | ||
|
||
flowProcess.increment(SkewMonitorCounters.KeyCount, SkewMonitorCounters.KeyCount, 1L) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. probably better to use a group like "scalding.debug" since I think we want to group them up so they are easy to see next to each other in the job tracker UI. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we want the group and counter to be the same string. We want the group to be something like "scalding debug" and the counter value is fine. |
||
flowProcess.increment(SkewMonitorCounters.ValuesCountSum, SkewMonitorCounters.ValuesCountSum, numValuesPerKey) | ||
flowProcess.increment(SkewMonitorCounters.ValuesCountSquareSum, SkewMonitorCounters.ValuesCountSquareSum, numValuesPerKey * numValuesPerKey) | ||
} | ||
|
||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1817,6 +1817,46 @@ class CounterJobTest extends WordSpec with Matchers { | |
} | ||
} | ||
|
||
class ReduceValueCountJob(args: Args) extends Job(args) { | ||
TypedPipe.from(List((1, (1, 1)), (2, (2, 2)), (1, (3, 3)), (3, (3, 3)))) | ||
.group | ||
.foldLeft((0, 0)){ (a, b) => (a._1 + b._1, a._2 + b._2) } | ||
.toTypedPipe | ||
.map{ | ||
case (a: Int, (b: Int, c: Int)) => | ||
(a, b, c) | ||
} | ||
.write(TypedTsv[(Int, Int, Int)](args("output"))) | ||
} | ||
|
||
class ReduceValueCounterTest extends WordSpec with Matchers { | ||
"Reduce Values Count" should { | ||
JobTest(new com.twitter.scalding.ReduceValueCountJob(_)) | ||
.arg("output", "output0") | ||
.counter(SkewMonitorCounters.KeyCount, SkewMonitorCounters.KeyCount){ | ||
x => | ||
x should be(3) | ||
} | ||
.counter(SkewMonitorCounters.ValuesCountSquareSum, SkewMonitorCounters.ValuesCountSquareSum) { | ||
x => | ||
// key 1 has two values, thus 2^2 = 4. key 2 and 3 has only one respectively | ||
x should be(4 + 1 + 1) | ||
} | ||
.counter(SkewMonitorCounters.ValuesCountSum, SkewMonitorCounters.ValuesCountSum) { | ||
x => | ||
// key 1 has two values, thus 2^2 = 4. key 2 and 3 has only one respectively | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. looks like a copied comment, don't want There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah you're right. it's a copied commend |
||
x should be (2 + 1 + 1) | ||
} | ||
.sink[(Int, Int, Int)](TypedTsv[(Int, Int, Int)]("output0")){ | ||
tuples => | ||
|
||
} | ||
.runHadoop | ||
.finish | ||
} | ||
|
||
} | ||
|
||
object DailySuffixTsvJob { | ||
val strd1 = "2014-05-01" | ||
val strd2 = "2014-05-02" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -163,4 +163,33 @@ class ReduceOperationsTest extends WordSpec with Matchers { | |
.runHadoop | ||
.finish | ||
} | ||
|
||
"Std for number of values aassocated with each key " should { | ||
class SDForReduceOperation(args: Args) extends Job(args) { | ||
TypedPipe.from(List((1, (1, 1)), (2, (2, 2)), (1, (3, 3)), (3, (3, 3)))) | ||
.group | ||
.foldLeft((0, 0)){ (a, b) => (a._1 + b._1, a._2 + b._2) } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you could do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. FIxed. |
||
.toTypedPipe | ||
.map{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
case (a: Int, (b: Int, c: Int)) => | ||
(a, b, c) | ||
} | ||
.write(TypedTsv[(Int, Int, Int)](args("output"))) | ||
} | ||
|
||
class ExperimentTest extends WordSpec with Matchers { | ||
"A PageRank2 job" should { | ||
JobTest(new com.twitter.scalding.ReduceValueCountJob(_)) | ||
.arg("output", "blah") | ||
.sink[(Int, Int, Int)](TypedTsv[(Int, Int, Int)]("blah")){ | ||
tuples => | ||
println("RES = " + tuples) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's remove println statements. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. Dup-test. Thanks for the catch |
||
} | ||
.run | ||
.finish | ||
} | ||
|
||
} | ||
|
||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is bug by itself. Can you confirm this is still true and not due to another issue? We don't want to make a bunch of groups.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems to be a bug. if you simply put
flowProcess.increment("TestGroup", "TestKey", 1)
in def operate(flowProcess: FlowProcess[_], call: BufferCall[Any]) {
and this should print 3 in the test
ReduceValueCounterTest. However it prints
PRINTING KEY AND GROUP! 0