Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@ lazy val root = (project in file("."))
.settings(commons: _*)
.settings(
libraryDependencies ++= Seq(
"org.specs2" %% "specs2" % "3.7",
"com.machinepublishers" % "jbrowserdriver" % "0.14.7",
"org.jsoup" % "jsoup" % "1.9.2",
"com.typesafe.scala-logging" % "scala-logging_2.11" % "3.4.0",
"ch.qos.logback" % "logback-classic" % "1.1.7",
"com.rockymadden.stringmetric" % "stringmetric-core_2.11" % "0.27.4",
"io.github.lukehutch" % "fast-classpath-scanner" % "1.9.21"
"io.github.lukehutch" % "fast-classpath-scanner" % "1.9.19"
)
)
)
6 changes: 4 additions & 2 deletions src/main/scala/eu/unicredit/web/HtmlExtractor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,11 @@ class VisualTagTreeBuilder(headless: Boolean = true, quickRender: Boolean = true
.split(";")
.filter(_.contains("::"))
.map(_.split("::"))
.map {
.flatMap {
case Array(prop, value) =>
prop -> value
Some(prop -> value)
case Array(prop) =>
Some(prop -> "")
}
.toMap
.filterNot(_._1.startsWith("-webkit"))
Expand Down
4 changes: 3 additions & 1 deletion src/main/scala/eu/unicredit/web/Models.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ import org.jsoup.Jsoup
import scala.annotation.tailrec
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.util.{ Failure, Success, Try }
import scala.util.Try


/**
* Created by fabiofumarola on 24/05/16.
Expand Down Expand Up @@ -125,6 +126,7 @@ object Models {
from: Seq[WebList] = Seq.empty) {
lazy val urls = elements.flatMap(_.urls)
lazy val bfs = elements.flatMap(_.bfs)

}

}
Expand Down
53 changes: 53 additions & 0 deletions src/main/scala/eu/unicredit/web/hylien/Distances.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package eu.unicredit.web.hylien

import com.rockymadden.stringmetric.similarity._
import eu.unicredit.web.Models.DomNode

import scala.collection.mutable
import scala.util.Try
Expand Down Expand Up @@ -33,6 +34,58 @@ object Distances {
Encoder.encode(b).toArray)
}

/**
 * Simple tree matching score between two DOM subtrees.
 *
 * Despite the name, the returned value grows with similarity: it is 0 when
 * the two root tags differ, otherwise 1 (for the matching roots) plus the
 * best score obtainable by aligning the two child sequences in order.
 *
 * @param a root of the first subtree
 * @param b root of the second subtree
 * @return the matching score; 0 when the root tag names differ
 */
def treeEditDistance (a: DomNode, b:DomNode): Double = {
  if (!a.tagName.equals(b.tagName)) 0D
  else {
    val rowCount = a.children.size + 1
    val columnCount = b.children.size + 1
    // Array.ofDim zero-fills the matrix, so row 0 and column 0 already hold
    // the "nothing matched yet" base case.
    val scores = Array.ofDim[Double](rowCount, columnCount)

    // Fill row by row: each cell keeps the best of skipping a child of b,
    // skipping a child of a, or pairing the two current children.
    for (i <- 1 until rowCount; j <- 1 until columnCount) {
      val fromLeft = scores(i)(j - 1)
      val fromAbove = scores(i - 1)(j)
      val fromDiagonal =
        scores(i - 1)(j - 1) + treeEditDistance(a.children(i - 1), b.children(j - 1))
      scores(i)(j) = math.max(math.max(fromLeft, fromAbove), fromDiagonal)
    }

    // The extra 1 accounts for the two matching roots.
    1D + scores(rowCount - 1)(columnCount - 1)
  }
}

/**
 * Normalises the tree matching score of `a` and `b` into a distance.
 *
 * The score from `treeEditDistance` is divided by the average node count of
 * the two subtrees and the ratio is subtracted from 1.
 *
 * @param a root of the first subtree
 * @param b root of the second subtree
 * @return 1 minus (matching score / average number of nodes)
 */
def normalizedTreeEditDistance (a: DomNode, b:DomNode) : Double = {
  // Counts nodes with an explicit work list: pop the head, push its
  // children in front of the rest, and bump the counter.
  @scala.annotation.tailrec
  def countNodes(pending: List[DomNode], counted: Int): Int =
    if (pending.isEmpty) counted
    else countNodes(pending.head.children.toList ++ pending.tail, counted + 1)

  val matched = treeEditDistance(a, b)
  val meanSize = (countNodes(List(a), 0) + countNodes(List(b), 0)).toDouble / 2
  1 - (matched / meanSize)
}

}

object Encoder {
Expand Down
6 changes: 4 additions & 2 deletions src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ class VisualHyLiEn(headless: Boolean = true, quickRender: Boolean = true,
logReqs = logReqs,
browserSize = browserSize)

def extract(url: String, tagSimFactor: Float = 0.4F, maxRecordTags: Int = 30): Seq[WebList] = {
def extract(url: String, tagSimFactor: Float = 0.4F, maxRecordTags: Int = 60): Seq[WebList] = {
val startTime = System.currentTimeMillis()
val root = webExtractor.parse(url)
logger.debug(s"parsed $url, start extracting lists")
val totalTime = System.currentTimeMillis() - startTime
logger.info(s"parsed $url in $totalTime millisec, start extracting lists")

@tailrec
def extract0(notAligned: List[DomNode], acc: List[WebList]): List[WebList] =
Expand Down
3 changes: 2 additions & 1 deletion src/main/scala/eu/unicredit/web/hylien/ListsFinder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ private[this] object VisualListFinder {
//take the head and for the tail filter all the elements similar to the head
case head :: tail =>
head :: tail.filter { n =>
val dist = Distances.normalizedEditDistance(head.bfs, n.bfs)
//val dist = Distances.normalizedEditDistance(head.bfs, n.bfs)
val dist = Distances.normalizedTreeEditDistance(head, n)
if (dist > minsim) nonSimilar = n :: nonSimilar
dist <= minsim
}
Expand Down
24 changes: 0 additions & 24 deletions src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ import scala.collection.mutable
*/
object VisualTagTreeBuilderTest extends App {

// val url = "https://ec.europa.eu/research/participants/portal/desktop/en/opportunities/h2020/#c,calls=level3/t/EU.1./0/1/1/default-group&level4/t/EU.1.1./0/1/1/default-group&level4/t/EU.1.2./0/1/1/default-group&level4/t/EU.1.3./0/1/1/default-group&level4/t/EU.1.4./0/1/1/default-group&level3/t/EU.2./0/1/1/default-group&level4/t/EU.2.1./0/1/1/default-group&level5/t/EU.2.1.1./0/1/1/default-group&level5/t/EU.2.1.2./0/1/1/default-group&level5/t/EU.2.1.3./0/1/1/default-group&level5/t/EU.2.1.4./0/1/1/default-group&level5/t/EU.2.1.5./0/1/1/default-group&level5/t/EU.2.1.6./0/1/1/default-group&level4/t/EU.2.2./0/1/1/default-group&level4/t/EU.2.3./0/1/1/default-group&level3/t/EU.3./0/1/1/default-group&level4/t/EU.3.1./0/1/1/default-group&level4/t/EU.3.2./0/1/1/default-group&level4/t/EU.3.3./0/1/1/default-group&level4/t/EU.3.4./0/1/1/default-group&level4/t/EU.3.5./0/1/1/default-group&level4/t/EU.3.6./0/1/1/default-group&level4/t/EU.3.7./0/1/1/default-group&level3/t/EU.4./0/1/1/default-group&level3/t/EU.5./0/1/1/default-group&level3/t/EU.7./0/1/1/default-group&level2/t/Euratom/0/1/1/default-group&hasForthcomingTopics/t/true/1/1/0/default-group&hasOpenTopics/t/true/1/1/0/default-group&allClosedTopics/t/true/0/1/0/default-group&+PublicationDateLong/asc"
// val url = "https://www.stanford.edu/"
val url = "http://www.bsvillage.com/Piscine-Fuori-Terra/"

val time2 = System.currentTimeMillis()
Expand All @@ -19,35 +17,13 @@ object VisualTagTreeBuilderTest extends App {
println(s"page parsed into ${System.currentTimeMillis() - time2}")
parser1.close()

// var time1 = System.currentTimeMillis()
// val visual1 = new VisualWebExtractor(true, true)
// (1 to 10).foreach(_ => visual1.parse(url))
// println(s"page parsed into ${(System.currentTimeMillis() - time1) / 10}")

// val time3 = System.currentTimeMillis()
// val parser2 = new VisualWebExtractor(true, true)
// parser2.parse(url)
// println(s"page parsed into ${System.currentTimeMillis() - time3}")
// parser2.close()

println(root)

}

object TagTreeBuilderTest extends App {
// val url = "https://ec.europa.eu/research/participants/portal/desktop/en/opportunities/h2020/#c,calls=level3/t/EU.1./0/1/1/default-group&level4/t/EU.1.1./0/1/1/default-group&level4/t/EU.1.2./0/1/1/default-group&level4/t/EU.1.3./0/1/1/default-group&level4/t/EU.1.4./0/1/1/default-group&level3/t/EU.2./0/1/1/default-group&level4/t/EU.2.1./0/1/1/default-group&level5/t/EU.2.1.1./0/1/1/default-group&level5/t/EU.2.1.2./0/1/1/default-group&level5/t/EU.2.1.3./0/1/1/default-group&level5/t/EU.2.1.4./0/1/1/default-group&level5/t/EU.2.1.5./0/1/1/default-group&level5/t/EU.2.1.6./0/1/1/default-group&level4/t/EU.2.2./0/1/1/default-group&level4/t/EU.2.3./0/1/1/default-group&level3/t/EU.3./0/1/1/default-group&level4/t/EU.3.1./0/1/1/default-group&level4/t/EU.3.2./0/1/1/default-group&level4/t/EU.3.3./0/1/1/default-group&level4/t/EU.3.4./0/1/1/default-group&level4/t/EU.3.5./0/1/1/default-group&level4/t/EU.3.6./0/1/1/default-group&level4/t/EU.3.7./0/1/1/default-group&level3/t/EU.4./0/1/1/default-group&level3/t/EU.5./0/1/1/default-group&level3/t/EU.7./0/1/1/default-group&level2/t/Euratom/0/1/1/default-group&hasForthcomingTopics/t/true/1/1/0/default-group&hasOpenTopics/t/true/1/1/0/default-group&allClosedTopics/t/true/0/1/0/default-group&+PublicationDateLong/asc"
// val url = "https://www.stanford.edu/"
val url = "http://www.cs.illinois.edu/directory/faculty"

// var time1 = System.currentTimeMillis()
// val visual1 = new TagTreeBuilder()
// (1 to 10).foreach(_ => visual1.parse(url))
// println(s"page parsed into ${(System.currentTimeMillis() - time1) / 10}")
//
// val time2 = System.currentTimeMillis()
// val root = new TagTreeBuilder().parse(url)
// println(s"page parsed into ${System.currentTimeMillis() - time2}")

val time3 = System.currentTimeMillis()
val root = new TagTreeBuilder().parse(url)
println(s"page parsed into ${System.currentTimeMillis() - time3}")
Expand Down
19 changes: 4 additions & 15 deletions src/test/scala/eu/unicredit/web/HyLiEnTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,7 @@ object HyLiEnTest extends App {
headless = true, quickRender = true,
logReqs = false, browserSize = BrowserSize(1920, 1080))

val lists = hylien.extract("http://www.immobiliare.it/44602950-Vendita-Bilocale-via-Pola-2-Milano.html")


//("http://www.cs.illinois.edu/directory/faculty?quicktabs_faculty_tabs_new=1#quicktabs-faculty_tabs_new")

//("http://www.cs.illinois.edu")

//("https://it.wikipedia.org/wiki/Fiat_Chrysler_Automobiles")
//("http://www.bsvillage.com/Piscine-Fuori-Terra/")

//("http://www.cs.illinois.edu/directory/faculty")

//("http://www.cs.ox.ac.uk/")

// //("http://www.harvard.edu/") //("http://cs.stanford.edu/")
val lists = hylien.extract("http://www.idealista.it/vendita-case/milano-milano/")

println(s"Got ${lists.size} lists")
lists.foreach { l =>
Expand All @@ -38,6 +24,7 @@ object HyLiEnTest extends App {

def toString(l: WebList): String = {
val buf = new StringBuilder

buf ++= s"Printing ${l.orientation} of by ${l.elements.size} elements obtained merging ${l.from.size} lists \n"
buf ++= s"parent dom tag = ${l.parent.tagName}\n"
buf ++= s"location = ${l.location} \n"
Expand Down Expand Up @@ -65,4 +52,6 @@ object HyLiEnTest extends App {

buf.toString()
}


}
Loading