In this tutorial, we will show you how to extract hyperlinks from an HTML page. For example, to get the link from the following content:
this is text1 <a href='mkyong.com' target='_blank'>hello</a> this is text2...
- First, get the “value” from the a tag – Result : a href='mkyong.com' target='_blank'
- Later, get the “link” from the value extracted above – Result :
mkyong.com
1. Regular Expression Pattern
Extract A tag Regular Expression Pattern
(?i)<a([^>]+)>(.+?)</a>
Extract Link From A tag Regular Expression Pattern
\s*(?i)href\s*=\s*("([^"]*")|'[^']*'|([^'">\s]+))
Description
(?i) # inline flag (not a capturing group): all matching is case insensitive
<a #start with "<a"
( # start of group #2
[^>]+ # anything except (">"), at least one character
) # end of group #2
> # follow by ">"
(.+?) # the link text: match anything, at least one character, as few characters as possible
</a> # end with "</a>"
\s* #can start with whitespace
(?i) # all matching is case insensitive
href # follow by "href" word
\s*=\s* # allows spaces on either side of the equal sign,
( # start of group #1
"([^"]*") # allow string with double quotes enclosed - "string"
| # ..or
'[^']*' # allow string with single quotes enclosed - 'string'
| # ..or
([^'">\s]+) # an unquoted value: must not contain single quotes, double quotes, ">" or whitespace
) # end of group #1
2. Java Link Extractor Example
Here’s a simple Java link extractor example: it uses the 1st pattern to extract the content of each a tag, then applies the 2nd pattern to that content to extract the link.
HTMLLinkExtractor.java
package com.mkyong.crawler.core;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts hyperlinks ({@code <a ... href=...>text</a>}) from HTML text using two
 * regular expressions: one to find each anchor element, one to find the
 * {@code href} value inside the anchor's attribute list.
 *
 * <p>This is a lightweight regex scan, not an HTML parser. Known limitations:
 * an attribute value containing {@code >} ends the tag early, and nested or
 * unclosed anchors are not handled.
 *
 * <p>The compiled patterns are shared and immutable; every call to
 * {@link #grabHTMLLinks(String)} uses its own matchers, so the extractor holds
 * no per-call state.
 */
public class HTMLLinkExtractor {

    /*
     * (?is)      case-insensitive; '.' also matches line terminators, so link
     *            text may span several lines
     * <a\s+      the tag name must be exactly "a": requiring whitespace after it
     *            stops <abbr>, <area>, <address>, ... from being taken for anchors
     * ([^>]+)    group 1: the attribute list
     * (.*?)      group 2: the link text, possibly empty, as short as possible
     * </a\s*>    closing tag, tolerating whitespace before '>'
     */
    private static final String HTML_A_TAG_PATTERN = "(?is)<a\\s+([^>]+)>(.*?)</a\\s*>";

    /*
     * (?<![\w-]) "href" must not be the tail of another attribute name such as
     *            data-href
     * group 1    the raw value: "double quoted", 'single quoted' or unquoted
     */
    private static final String HTML_A_HREF_TAG_PATTERN =
            "(?i)(?<![\\w-])href\\s*=\\s*(\"[^\"]*\"|'[^']*'|[^'\">\\s]+)";

    // Compiled once for the whole class instead of once per instance.
    private static final Pattern PATTERN_TAG = Pattern.compile(HTML_A_TAG_PATTERN);
    private static final Pattern PATTERN_LINK = Pattern.compile(HTML_A_HREF_TAG_PATTERN);

    public HTMLLinkExtractor() {
    }

    /**
     * Extracts every link found in the given HTML.
     *
     * @param html
     *            HTML content to scan; {@code null} is treated as empty content
     * @return the links in document order, each with its target and link text;
     *         empty (never {@code null}) when nothing is found
     */
    public Vector<HtmlLink> grabHTMLLinks(final String html) {
        Vector<HtmlLink> result = new Vector<HtmlLink>();
        if (html == null) {
            return result;
        }

        Matcher matcherTag = PATTERN_TAG.matcher(html);
        while (matcherTag.find()) {
            String attributes = matcherTag.group(1);
            String linkText = matcherTag.group(2);

            // One entry per href found in the attribute list of this anchor.
            Matcher matcherLink = PATTERN_LINK.matcher(attributes);
            while (matcherLink.find()) {
                HtmlLink obj = new HtmlLink();
                obj.setLink(matcherLink.group(1)); // setLink strips the enclosing quotes
                obj.setLinkText(linkText);
                result.add(obj);
            }
        }
        return result;
    }

    /** A single extracted link: its target and the text between the anchor tags. */
    class HtmlLink {

        String link;
        String linkText;

        HtmlLink() {
        }

        @Override
        public String toString() {
            return "Link : " + this.link + " Link Text : " + this.linkText;
        }

        public String getLink() {
            return link;
        }

        /**
         * Stores the link target, removing one pair of enclosing quotes if present.
         *
         * @param link
         *            raw attribute value, quoted or not
         */
        public void setLink(String link) {
            this.link = replaceInvalidChar(link);
        }

        public String getLinkText() {
            return linkText;
        }

        public void setLinkText(String linkText) {
            this.linkText = linkText;
        }

        /**
         * Removes a matching pair of enclosing single or double quotes. Quotes
         * inside the value are part of the link and are kept.
         */
        private String replaceInvalidChar(String link) {
            if (link == null || link.length() < 2) {
                return link;
            }
            char first = link.charAt(0);
            char last = link.charAt(link.length() - 1);
            if ((first == '"' || first == '\'') && first == last) {
                return link.substring(1, link.length() - 1);
            }
            return link;
        }
    }
}
3. Unit Test
Unit test with TestNG. Simulate the HTML content via @DataProvider.
TestHTMLLinkExtractor.java
package com.mkyong.crawler.core;
import java.util.Vector;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import com.mkyong.crawler.core.HTMLLinkExtractor.HtmlLink;
/**
 * Tests for the HTML link extractor.
 *
 * @author mkyong
 *
 */
public class TestHTMLLinkExtractor {

    private HTMLLinkExtractor htmlLinkExtractor;

    String TEST_LINK = "http://www.google.com";

    /** Creates the extractor used by the test method. */
    @BeforeClass
    public void initData() {
        htmlLinkExtractor = new HTMLLinkExtractor();
    }

    /**
     * Supplies HTML snippets that each contain at least one anchor pointing at
     * {@code TEST_LINK}, varying tag/attribute case, attribute order and quoting.
     */
    @DataProvider
    public Object[][] HTMLContentProvider() {
        Object[][] samples = {
            { "abc hahaha <a href='" + TEST_LINK + "'>google</a>" },
            { "abc hahaha <a HREF='" + TEST_LINK + "'>google</a>" },
            { "abc hahaha <A HREF='" + TEST_LINK + "'>google</A> , "
                + "abc hahaha <A HREF='" + TEST_LINK + "' target='_blank'>google</A>" },
            { "abc hahaha <A HREF='" + TEST_LINK + "' target='_blank'>google</A>" },
            { "abc hahaha <A target='_blank' HREF='" + TEST_LINK + "'>google</A>" },
            { "abc hahaha <A target='_blank' HREF=\"" + TEST_LINK + "\">google</A>" },
            { "abc hahaha <a HREF=" + TEST_LINK + ">google</a>" },
        };
        return samples;
    }

    /**
     * Every snippet must yield at least one link, and every extracted link must
     * equal {@code TEST_LINK}.
     */
    @Test(dataProvider = "HTMLContentProvider")
    public void ValidHTMLLinkTest(String html) {
        Vector<HtmlLink> extracted = htmlLinkExtractor.grabHTMLLinks(html);

        // an empty result means the extractor missed the link entirely
        Assert.assertTrue(extracted.size() > 0);

        for (HtmlLink candidate : extracted) {
            Assert.assertEquals(candidate.getLink(), TEST_LINK);
        }
    }
}
Result
[TestNG] Running:
/private/var/folders/w8/jxyz5pf51lz7nmqm_hv5z5br0000gn/T/testng-eclipse--530204890/testng-customsuite.xml
PASSED: ValidHTMLLinkTest("abc hahaha <a href='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com'>google</A> , abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF='http://www.google.com'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF="http://www.google.com">google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF=http://www.google.com>google</a>")
Pingback: Bed&Breakfast traditional travel()
Pingback: keypad gun safe()
Pingback: Christmas fireplace Video()
Pingback: Christmas fireplace Video()
Pingback: casino games()
Pingback: happy hanukkah wishes()
Pingback: useful source()
Pingback: question voyance()
Pingback: Live Suite Pro Bonus()
Pingback: https://www.youtube.com/watch?v=G-I_c_Cmr9Y()
Pingback: Key Finders()
Pingback: oil vape pens()
Pingback: vape pen()
Pingback: Best Key Finders()
Pingback: forskolin()
Pingback: car entertainment system()