@@ -16,116 +16,129 @@ module.exports = (async function(seq) {
1616
1717 /********************* Scrape from CAS DESIGNER ********************/
1818
19- await page . goto ( CAS_DESIGNER ) ;
20- await page . type ( "#query_seq" , seq ) ;
19+ try {
20+ await page . goto ( CAS_DESIGNER ) ;
21+ await page . type ( "#query_seq" , seq ) ;
22+
23+ await Promise . all ( [
24+ page . waitForNavigation ( ) ,
25+ page . click ( '.btn-primary' )
26+ ] ) ;
2127
22-
23- await Promise . all ( [
24- page . waitForNavigation ( ) ,
25- page . click ( '.btn-primary' )
26- ] ) ;
27-
28- // wait until entire job finished..
29- await page . waitForSelector ( "#results #table_res > tbody > tr:not([style*='display: none'])" ) ;
30-
31- const rgenResults = await page . $$ ( "#results #table_res > tbody > tr:not([style*='display: none'])" ) ;
32-
33- for ( rgenResult of rgenResults ) {
34- const rgenPol = await rgenResult . $eval ( "td:nth-child(4)" ,
35- ( el ) => el . textContent ) . catch ( ( err ) => console . error ( "No polarity" ) ) ;
28+ // wait until entire job finished..
29+ await page . waitForSelector ( "#results #table_res > tbody > tr:not([style*='display: none'])" ) ;
30+
31+ const rgenResults = await page . $$ ( "#results #table_res > tbody > tr:not([style*='display: none'])" ) ;
32+
33+ for ( rgenResult of rgenResults ) {
34+ const rgenPol = await rgenResult . $eval ( "td:nth-child(4)" ,
35+ ( el ) => el . textContent ) . catch ( ( err ) => console . error ( "No polarity" ) ) ;
36+
37+ let rgenScore = await rgenResult . $eval ( "td:nth-child(6) > a" , ( el ) => ( el . textContent ) ) . catch ( ( err ) => console . error ( "No score" ) ) ;
38+ rgenScore = parseFloat ( rgenScore ) ;
39+ if ( ! isNaN ( rgenScore ) && rgenPol === "+" && rgenScore >= 66.0 ) {
40+ // also going to filter out 'greyed out' ones
41+ const rgenSeq = await rgenResult . $eval ( "td:nth-child(1) span span:not([class*='tt'])" , ( el ) => el . textContent . slice ( 0 , - 3 ) ) . catch ( ( err ) => console . error ( "no rgenome sequence?" , err ) ) ;
3642
37- let rgenScore = await rgenResult . $eval ( "td:nth-child(6) > a" , ( el ) => ( el . textContent ) ) . catch ( ( err ) => console . error ( "No score" ) ) ;
38- rgenScore = parseFloat ( rgenScore ) ;
39- if ( ! isNaN ( rgenScore ) && rgenPol === "+" && rgenScore >= 66.0 ) {
40- // also going to filter out 'greyed out' ones
41- const rgenSeq = await rgenResult . $eval ( "td:nth-child(1) span span:not([class*='tt'])" , ( el ) => el . textContent . slice ( 0 , - 3 ) ) . catch ( ( err ) => console . error ( "no rgenome sequence?" , err ) ) ;
42-
43- if ( rgenSeq in targets ) {
44- let entry = { ...targets [ rgenSeq ] , ...{ rgenome : page . url ( ) } } ;
45- targets [ rgenSeq ] = entry ;
46- } else {
47- targets [ rgenSeq ] = { rgenome : page . url ( ) } ;
43+ if ( rgenSeq in targets ) {
44+ let entry = { ...targets [ rgenSeq ] , ...{ rgenome : page . url ( ) } } ;
45+ targets [ rgenSeq ] = entry ;
46+ } else {
47+ targets [ rgenSeq ] = { rgenome : page . url ( ) } ;
48+ }
4849 }
4950 }
51+ } catch ( err ) {
52+ console . error ( "rgenome err" , err ) ;
5053 }
51-
5254
55+
5356
5457
58+
5559
5660
5761
5862
5963
60- /********************* Scrape from CCTOP ********************/
6164
62- await page . goto ( CCTOP ) ;
63- await page . type ( "#seq" , seq ) ;
64- await page . type ( "#demo_q" , "hg38" ) ;
65- await page . waitForXPath ( "//*[@id='hg38_anchor']" ) ;
66- await page . evaluate ( ( ) => {
67- [ ...document . querySelectorAll ( ".jstree-search" ) ] . find ( el => el . textContent === "Human (Homo sapiens GRCh38/hg38)" ) . click ( ) ;
68- } ) ;
69- //
70- await page . click ( "#subm" ) ;
7165
72-
7366
67+ /********************* Scrape from CCTOP ********************/
7468
75- await page . waitForSelector ( "iframe[src='unnamed_frame.html']" , {
76- visible : true ,
77- timeout : 0
78- } ) ;
69+ try {
70+
71+ await page . goto ( CCTOP ) ;
72+ await page . type ( "#seq" , seq ) ;
73+ await page . type ( "#demo_q" , "hg38" ) ;
74+ await page . waitForXPath ( "//*[@id='hg38_anchor']" ) ;
75+ await page . evaluate ( ( ) => {
76+ [ ...document . querySelectorAll ( ".jstree-search" ) ] . find ( el => el . textContent === "Human (Homo sapiens GRCh38/hg38)" ) . click ( ) ;
77+ } ) ;
78+ //
79+ await page . click ( "#subm" ) ;
7980
81+
8082
81-
82- // this is so that every frame is loaded...
83- await page . waitFor ( 5000 ) ;
84-
8583
86- const cctopFrame = await page . frames ( ) . find ( f => f . name ( ) === "iframe_targets" ) ;
87-
88-
89-
90- const cctopIframe = await page . waitForFunction (
91- "document.querySelector('#middleColumn > iframe').contentDocument.body.innerHTML"
92- ) ;
93-
94-
84+ await page . waitForSelector ( "iframe[src='unnamed_frame.html']" , {
85+ visible : true ,
86+ timeout : 0
87+ } ) ;
88+
89+
9590
96- let cctopHTML =
97- "<html><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'> <style type='text/css'> td.mono {font-family: 'Courier New', Courier, monospace}table.fancyTable, tr.fancyTable, td.fancyTable{border: 1px solid black;border-collapse: collapse;}.moveimage { position: relative; top: 1px;}.tooltip { font-size: 10px; display: none; position: absolute; border: 1px solid #cccccc; padding: 0px 5px; background-color: #f2f2f2; text-align: justify; box-shadow: 5px 5px 8px #AAA; width: 300px; line-height: 10px;}.hover:hover .tooltip { z-index: 10; text-decoration: none; display: block; position: absolute;}</style></head><body>"
98- + cctopIframe . toString ( ) + "</body></html>" ;
91+ // this is so that every frame is loaded...
92+ await page . waitFor ( 5000 ) ;
9993
100-
101-
102-
103- await page . setContent ( cctopHTML ) ;
104-
105-
106- const cctopResults = await page . $$ ( "body > table:nth-of-type(4n+2)" ) ;
107- const test = await page . $$ ( "body > table" ) ;
108-
109-
110-
111- for ( cctopResult of cctopResults ) {
112- const cctopScore = await cctopResult . $eval ( "tbody tr:nth-child(2) td:nth-child(2)" , ( el ) => el . textContent ) . catch ( ( err ) => console . error ( "No efficacy score" ) ) ;
113- const cctopRating = cctopScore . split ( " " ) ;
11494
115- if ( cctopRating [ cctopRating . length - 1 ] === "HIGH" ) {
116-
117- const cctopSeq = await cctopResult . $eval ( "tbody > tr:nth-child(1) > td.mono" , ( el ) => el . textContent . slice ( 0 , - 3 ) ) . catch ( ( err ) => console . error ( "Error finding cctop seq" ) ) ;
95+ const cctopFrame = await page . frames ( ) . find ( f => f . name ( ) === "iframe_targets" ) ;
96+
97+
98+
99+ const cctopIframe = await page . waitForFunction (
100+ "document.querySelector('#middleColumn > iframe').contentDocument.body.innerHTML"
101+ ) ;
102+
103+
104+
105+ let cctopHTML =
106+ "<html><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'> <style type='text/css'> td.mono {font-family: 'Courier New', Courier, monospace}table.fancyTable, tr.fancyTable, td.fancyTable{border: 1px solid black;border-collapse: collapse;}.moveimage { position: relative; top: 1px;}.tooltip { font-size: 10px; display: none; position: absolute; border: 1px solid #cccccc; padding: 0px 5px; background-color: #f2f2f2; text-align: justify; box-shadow: 5px 5px 8px #AAA; width: 300px; line-height: 10px;}.hover:hover .tooltip { z-index: 10; text-decoration: none; display: block; position: absolute;}</style></head><body>"
107+ + cctopIframe . toString ( ) + "</body></html>" ;
108+
109+
110+
111+
112+ await page . setContent ( cctopHTML ) ;
113+
114+
115+ const cctopResults = await page . $$ ( "body > table:nth-of-type(4n+2)" ) ;
116+ await page . $$ ( "body > table" ) ;
117+
118+
119+
120+ for ( cctopResult of cctopResults ) {
121+ const cctopScore = await cctopResult . $eval ( "tbody tr:nth-child(2) td:nth-child(2)" , ( el ) => el . textContent ) . catch ( ( err ) => console . error ( "No efficacy score" ) ) ;
122+ const cctopRating = cctopScore . split ( " " ) ;
118123
119- if ( cctopSeq in targets ) {
124+ if ( cctopRating [ cctopRating . length - 1 ] === "HIGH" ) {
120125
121- let entry = { ...targets [ cctopSeq ] , ...{ cctop : page . url ( ) } } ;
122- targets [ cctopSeq ] = entry ;
123- } else {
124- targets [ cctopSeq ] = { cctop : page . url ( ) } ;
126+ const cctopSeq = await cctopResult . $eval ( "tbody > tr:nth-child(1) > td.mono" , ( el ) => el . textContent . slice ( 0 , - 3 ) ) . catch ( ( err ) => console . error ( "Error finding cctop seq" ) ) ;
127+
128+ if ( cctopSeq in targets ) {
129+
130+ let entry = { ...targets [ cctopSeq ] , ...{ cctop : page . url ( ) } } ;
131+ targets [ cctopSeq ] = entry ;
132+ } else {
133+ targets [ cctopSeq ] = { cctop : page . url ( ) } ;
134+ }
125135 }
126136 }
137+ } catch ( err ) {
138+ console . error ( "cctop err" , err ) ;
127139 }
128140
141+
129142
130143
131144
@@ -135,54 +148,60 @@ module.exports = (async function(seq) {
135148
136149 /********************* Scrape from CRISPR Direct ********************/
137150
138- await page . goto ( CRIPSR_DIRECT ) ;
139- // replace sample sequence with our own
140- await page . $eval ( "#useq" , el => el . value = "" ) ;
141- await page . type ( "#useq" , seq ) ;
142-
143- //select correct specificity (hg38) from dropdown menu
144- await page . click ( "#ext-gen1698" ) ;
145- await page . evaluate ( ( ) => {
146- [ ... document . querySelectorAll ( ".x-boundlist-item" ) ] . find ( el => el . textContent === "Human (Homo sapiens) genome, GRCh38/hg38 (Dec, 2013)" ) . click ( ) ;
147- } ) ;
148- // design with our parameters
149- await page . click ( ".zbutton" ) ;
150-
151- // wait until target sequences show up first
152- await page . waitForSelector ( "select[name='result_length']" ) ;
153- // just show all entries in one page for easier scraping
154- await page . select ( "select[name='result_length']" , "-1" ) ;
155- // but only show highly specific target only
156- await page . evaluate ( ( ) => {
157- document . querySelector ( "#filter_highlight" ) . parentElement . click ( ) ;
158- } ) ;
159-
160-
161- // scrape target sequences (excluding PAM) for +
162- const CDResults = await page . $$ ( ".dataTable tbody tr" ) ;
163-
164-
151+ try {
152+ await page . goto ( CRIPSR_DIRECT ) ;
153+ // replace sample sequence with our own
154+ await page . $eval ( "#useq" , el => el . value = "" ) ;
155+ await page . type ( "#useq" , seq ) ;
156+
157+ //select correct specificity (hg38) from dropdown menu
158+ await page . click ( ".x-form-arrow-trigger" ) ;
159+ await page . evaluate ( ( ) => {
160+ [ ... document . querySelectorAll ( ".x-boundlist-item" ) ] . find ( el => el . textContent === "Human (Homo sapiens) genome, GRCh38/hg38 (Dec, 2013)" ) . click ( ) ;
161+ } ) ;
162+ // design with our parameters
163+ await page . click ( ".zbutton" ) ;
164+
165+ // wait until target sequences show up first
166+ await page . waitForSelector ( "select[name='result_length']" ) ;
167+ // just show all entries in one page for easier scraping
168+ await page . select ( "select[name='result_length']" , "-1" ) ;
169+ // but only show highly specific target only
170+ await page . evaluate ( ( ) => {
171+ document . querySelector ( "#filter_highlight" ) . parentElement . click ( ) ;
172+ } ) ;
173+
174+
175+ // scrape target sequences (excluding PAM) for +
176+ const CDResults = await page . $$ ( ".dataTable tbody tr" ) ;
177+
165178
166- for ( let CDResult of CDResults ) {
167179
168- const rawPolarity = await CDResult . $eval ( "td.v:nth-child(2)" , ( el ) => el . textContent ) . catch ( ( err ) => console . error ( "No polarity" ) ) ;
169- const polarity = rawPolarity . replace ( / ^ \s + | \s + $ | \s + (? = \s ) / g, "" ) ;
170- if ( polarity === "+" ) {
171- const cdSeq = await CDResult . $eval ( "td.v:nth-child(3) .mono" , ( el ) => el . textContent . slice ( 0 , - 3 ) . toUpperCase ( ) ) . catch ( ( err ) => console . error ( "No sequence" ) ) ;
180+ for ( let CDResult of CDResults ) {
172181
173- if ( cdSeq in targets ) {
174- let entry = { ...targets [ cdSeq ] , ...{ Crispr_Direct : page . url ( ) } } ;
175- targets [ cdSeq ] = entry ;
176- } else {
177- targets [ cdSeq ] = { Crispr_Direct : page . url ( ) } ;
182+ const rawPolarity = await CDResult . $eval ( "td.v:nth-child(2)" , ( el ) => el . textContent ) . catch ( ( err ) => console . error ( "No polarity" ) ) ;
183+ const polarity = rawPolarity . replace ( / ^ \s + | \s + $ | \s + (? = \s ) / g, "" ) ;
184+ if ( polarity === "+" ) {
185+ const cdSeq = await CDResult . $eval ( "td.v:nth-child(3) .mono" , ( el ) => el . textContent . slice ( 0 , - 3 ) . toUpperCase ( ) ) . catch ( ( err ) => console . error ( "No sequence" ) ) ;
186+
187+ if ( cdSeq in targets ) {
188+ let entry = { ...targets [ cdSeq ] , ...{ Crispr_Direct : page . url ( ) } } ;
189+ targets [ cdSeq ] = entry ;
190+ } else {
191+ targets [ cdSeq ] = { Crispr_Direct : page . url ( ) } ;
192+ }
178193 }
179194 }
195+ } catch ( err ) {
196+ console . error ( "crispr direct err" , err ) ;
180197 }
181198
182- console . log ( "targets after cd direct as well" , targets ) ;
199+
200+
183201
184202
185203 /********************* Scrape from CRISPOR ********************/
204+ try {
186205 await page . goto ( CRISPOR ) ;
187206 await page . $eval ( "textarea[name='seq']" , el => el . value = "" ) ;
188207 await page . type ( "textarea[name='seq']" , seq ) ;
@@ -220,8 +239,13 @@ module.exports = (async function(seq) {
220239 break ;
221240 }
222241 }
242+ } catch ( err ) {
243+ console . error ( "crispor err" , err ) ;
244+ }
245+
246+
223247
224- browser . close ( ) ;
248+ browser . close ( ) ;
225249
226250 delete targets [ "undefined" ] ;
227251
0 commit comments